[llvm-branch-commits] [clang] [clang-tools-extra] [llvm] [lldb] [flang] [mlir] [libcxx] [compiler-rt] [openmp] [lld] Refactor ModuleToObject to offer more flexibility to subclass (NFC) (PR #71165)

Mehdi Amini via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Nov 3 13:41:34 PDT 2023


https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/71165

>From fd82b5b2876b3885b0590ba4538c316fa0e33cf7 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 3 Nov 2023 10:21:13 +0000
Subject: [PATCH 01/76] [LV] Support recipes without underlying instr in
 collectPoisonGenRec.

Support recipes without an underlying instruction in
collectPoisonGeneratingRecipes by applying dyn_cast_or_null directly to
the underlying value.

Fixes https://github.com/llvm/llvm-project/issues/70590.
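
For illustration, the change boils down to the following pattern; a
minimal sketch (underlyingInstrOrNull is a hypothetical helper, not part
of the patch). Unlike getUnderlyingInstr(), which asserts that an
underlying Instruction exists, dyn_cast_or_null returns nullptr both for
a missing underlying value and for one that is not an Instruction:

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Support/Casting.h"

    using namespace llvm;

    // Hypothetical helper: tolerate recipes with no underlying value.
    // dyn_cast_or_null yields nullptr if UnderlyingV is null or refers
    // to a Value that is not an Instruction; getUnderlyingInstr() would
    // assert in the first case.
    static const Instruction *underlyingInstrOrNull(const Value *UnderlyingV) {
      return dyn_cast_or_null<Instruction>(UnderlyingV);
    }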
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  3 +-
 .../X86/drop-poison-generating-flags.ll       | 83 +++++++++++++++++++
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4f547886f602534..1c208f72af678f7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1103,7 +1103,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
         RecWithFlags->dropPoisonGeneratingFlags();
       } else {
-        Instruction *Instr = CurRec->getUnderlyingInstr();
+        Instruction *Instr = dyn_cast_or_null<Instruction>(
+            CurRec->getVPSingleValue()->getUnderlyingValue());
         (void)Instr;
         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
                "found instruction with poison generating flags not covered by "
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index b440da6dd866081..5694367dd1f9016 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -405,6 +405,89 @@ loop.exit:
   ret void
 }
 
+@c = external global [5 x i8]
+
+; Test case for https://github.com/llvm/llvm-project/issues/70590.
+; Note that the then block has UB, but I could not find any other way to
+; construct a suitable test case.
+define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
+; CHECK-LABEL: @pr70590_recipe_without_underlying_instr(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.+]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_SREM_CONTINUE6]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]],
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
+; CHECK:       pred.srem.if:
+; CHECK-NEXT:    [[TMP4:%.*]] = srem i64 3, 0
+; CHECK-NEXT:    br label [[PRED_SREM_CONTINUE]]
+; CHECK:       pred.srem.continue:
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i64 [ poison, %vector.body ], [ [[TMP4]], [[PRED_SREM_IF]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_SREM_IF1:%.*]], label [[PRED_SREM_CONTINUE2:%.*]]
+; CHECK:       pred.srem.if1:
+; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 3, 0
+; CHECK-NEXT:    br label [[PRED_SREM_CONTINUE2]]
+; CHECK:       pred.srem.continue2:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE]] ], [ [[TMP7]], [[PRED_SREM_IF1]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_SREM_IF3:%.*]], label [[PRED_SREM_CONTINUE4:%.*]]
+; CHECK:       pred.srem.if3:
+; CHECK-NEXT:    [[TMP10:%.*]] = srem i64 3, 0
+; CHECK-NEXT:    br label [[PRED_SREM_CONTINUE4]]
+; CHECK:       pred.srem.continue4:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE2]] ], [ [[TMP10]], [[PRED_SREM_IF3]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_SREM_IF5:%.*]], label [[PRED_SREM_CONTINUE6]]
+; CHECK:       pred.srem.if5:
+; CHECK-NEXT:    [[TMP13:%.*]] = srem i64 3, 0
+; CHECK-NEXT:    br label [[PRED_SREM_CONTINUE6]]
+; CHECK:       pred.srem.continue6:
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i64 [ poison, [[PRED_SREM_CONTINUE4]] ], [ [[TMP13]], [[PRED_SREM_IF5]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP5]], -3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[TMP0]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i8> [[WIDE_LOAD]], <4 x i8> zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr %dst, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    store <4 x i8> [[PREDPHI]], ptr [[TMP20]], align 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    br i1 true, label %middle.block, label %vector.body
+; CHECK:       middle.block:
+
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %inc, %loop.latch ]
+  %cmp = icmp eq i64 %iv, %n
+  br i1 %cmp, label %loop.latch, label %then
+
+then:
+  %rem = srem i64 3, 0
+  %add3 = add i64 %rem, -3
+  %add5 = add i64 %iv, %add3
+  %gep = getelementptr [5 x i8], ptr @c, i64 0, i64 %add5
+  %l = load i8, ptr %gep, align 1
+  br label %loop.latch
+
+loop.latch:
+  %sr = phi i8 [ 0, %loop.header ], [ %l , %then ]
+  %gep.dst = getelementptr i8, ptr %dst, i64 %iv
+  store i8 %sr, ptr %gep.dst, align 4
+  %inc = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %inc, 4
+  br i1 %exitcond.not, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
 attributes #0 = { noinline nounwind uwtable "target-features"="+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl" }
 
 !0 = !{}

>From 8fd43fddd3b1c1dba9d59a10268a6cd71c2e3504 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 3 Nov 2023 10:25:04 +0000
Subject: [PATCH 02/76] [mlir][ArmSME] Provide descriptions and summaries for
 intrinsic types (#71057)

Follow-on for some types missed in #70920. This also replaces
LDSTPredicate with SVEPredicate (as they are equivalent), and adds
missing rank == 1 checks to the SVE vector types.

A FIXME is also added to point out an issue in the MOPVector type
constraint: it still admits combinations that are not valid SVE vectors,
e.g. vector<[16]xf32>.
---
 .../Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td   | 42 ++++++++++++++++---
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
index e369ef203ad39d6..c86a73812a5899c 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
@@ -19,10 +19,40 @@ include "ArmSME.td"
 // ArmSME Intrinsic op definitions
 //===----------------------------------------------------------------------===//
 
-def MOPPredicate : ScalableVectorOfLengthAndType<[16, 8, 4, 2], [I1]>;
-def MOPVector : ScalableVectorOfLengthAndType<[16, 8, 4, 2],
-                                              [I8, I16, BF16, F16, F32, F64]>;
-def LDSTPredicate : ScalableVectorOfLengthAndType<[16, 8, 4, 2, 1], [I1]>;
+def MOPPredicate : ScalableVectorOfRankAndLengthAndType<[1], [16, 8, 4, 2], [I1]>
+{
+  let summary = "a vector type that is a supported predicate for the SME MOP instructions";
+  let description = [{
+    Possible vector types:
+
+    * `vector<[16]xi1>`
+    * `vector<[8]xi1>`
+    * `vector<[4]xi1>`
+    * `vector<[2]xi1>`
+  }];
+}
+
+// FIXME: This allows types that are not SVE vectors, e.g. vector<[16]xf32>.
+def MOPVector : ScalableVectorOfRankAndLengthAndType<[1], [16, 8, 4, 2],
+                                              [I8, I16, BF16, F16, F32, F64]>
+{
+  let summary = "a vector type that is a supported input for the SME MOP instructions";
+  let description = [{
+    Possible vector types:
+
+    Integer elements:
+
+    * `vector<[16]xi8>`
+    * `vector<[8]xi16>`
+
+    Floating point elements:
+
+    * `vector<[8]xf16>`
+    * `vector<[8]xbf16>`
+    * `vector<[4]xf32>`
+    * `vector<[2]xf64>`
+  }];
+}
 
 class ArmSME_IntrOp<string mnemonic, list<int> overloadedOperands = [],
                     list<Trait> traits = [], int numResults = 0,
@@ -65,7 +95,7 @@ def LLVM_aarch64_sme_usmops_wide : ArmSME_IntrMopOverloadedOp<"usmops.wide">;
 // Loads
 class ArmSME_IntrLoadOp<string mnemonic>
     : ArmSME_IntrOp<mnemonic>,
-      Arguments<(ins Arg<LDSTPredicate, "Vector predicate">:$predicate,
+      Arguments<(ins Arg<SVEPredicate, "Vector predicate">:$predicate,
                  Arg<LLVM_AnyPointer, "Load address">:$load_address,
                  Arg<I32, "Virtual tile ID">:$tile_id,
                  Arg<I32, "Tile slice">:$tile_slice_index)>;
@@ -84,7 +114,7 @@ def LLVM_aarch64_sme_ld1q_vert : ArmSME_IntrLoadOp<"ld1q.vert">;
 // Stores
 class ArmSME_IntrStoreOp<string mnemonic>
     : ArmSME_IntrOp<mnemonic>,
-      Arguments<(ins Arg<LDSTPredicate, "Vector predicate">:$predicate,
+      Arguments<(ins Arg<SVEPredicate, "Vector predicate">:$predicate,
                  Arg<LLVM_AnyPointer, "Store address", [MemWrite]>:$store_address,
                  Arg<I32, "Virtual tile ID">:$tile_id,
                  Arg<I32, "Tile slice">:$tile_slice_index)>;

>From 85f79233286c78a1b79d01ca0677230658732b35 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 11:17:26 +0100
Subject: [PATCH 03/76] [ConstantFold] Remove unnecessary cast of zero gep fold
 (NFCI)

Zero GEPs will be optimized away entirely, except in the cases
which this transform also excludes (splat, inrange).
---
 llvm/lib/IR/ConstantFold.cpp | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 4651767d0655a06..d69665bd6c56ae6 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -233,28 +233,6 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
       // Try hard to fold cast of cast because they are often eliminable.
       if (unsigned newOpc = foldConstantCastPair(opc, CE, DestTy))
         return foldMaybeUndesirableCast(newOpc, CE->getOperand(0), DestTy);
-    } else if (CE->getOpcode() == Instruction::GetElementPtr &&
-               // Do not fold addrspacecast (gep 0, .., 0). It might make the
-               // addrspacecast uncanonicalized.
-               opc != Instruction::AddrSpaceCast &&
-               // Do not fold bitcast (gep) with inrange index, as this loses
-               // information.
-               !cast<GEPOperator>(CE)->getInRangeIndex() &&
-               // Do not fold if the gep type is a vector, as bitcasting
-               // operand 0 of a vector gep will result in a bitcast between
-               // different sizes.
-               !CE->getType()->isVectorTy()) {
-      // If all of the indexes in the GEP are null values, there is no pointer
-      // adjustment going on.  We might as well cast the source pointer.
-      bool isAllNull = true;
-      for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
-        if (!CE->getOperand(i)->isNullValue()) {
-          isAllNull = false;
-          break;
-        }
-      if (isAllNull)
-        // This is casting one pointer type to another, always BitCast
-        return ConstantExpr::getPointerCast(CE->getOperand(0), DestTy);
     }
   }
 

>From c5a1d0ae6a69c7ba39a4386c30e49da0d2a3664e Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 11:22:38 +0100
Subject: [PATCH 04/76] Revert "[ConstantFold] Remove unnecessary
 BitCastConstantVector() (NFCI)"

This reverts commit 2182561b7ba675ca87356c02474eecb6ecfaa23f.

The all-ones special case isn't redundant after all. Will redo
the change while preserving it.
---
 llvm/lib/IR/ConstantFold.cpp | 53 +++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index d69665bd6c56ae6..a263cdcf4008a2a 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -37,6 +37,45 @@ using namespace llvm::PatternMatch;
 //                ConstantFold*Instruction Implementations
 //===----------------------------------------------------------------------===//
 
+/// Convert the specified vector Constant node to the specified vector type.
+/// At this point, we know that the elements of the input vector constant are
+/// all simple integer or FP values.
+static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
+
+  if (CV->isAllOnesValue()) return Constant::getAllOnesValue(DstTy);
+  if (CV->isNullValue()) return Constant::getNullValue(DstTy);
+
+  // Do not iterate on scalable vector. The num of elements is unknown at
+  // compile-time.
+  if (isa<ScalableVectorType>(DstTy))
+    return nullptr;
+
+  // If this cast changes element count then we can't handle it here:
+  // doing so requires endianness information.  This should be handled by
+  // Analysis/ConstantFolding.cpp
+  unsigned NumElts = cast<FixedVectorType>(DstTy)->getNumElements();
+  if (NumElts != cast<FixedVectorType>(CV->getType())->getNumElements())
+    return nullptr;
+
+  Type *DstEltTy = DstTy->getElementType();
+  // Fast path for splatted constants.
+  if (Constant *Splat = CV->getSplatValue()) {
+    return ConstantVector::getSplat(DstTy->getElementCount(),
+                                    ConstantExpr::getBitCast(Splat, DstEltTy));
+  }
+
+  SmallVector<Constant*, 16> Result;
+  Type *Ty = IntegerType::get(CV->getContext(), 32);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    Constant *C =
+      ConstantExpr::getExtractElement(CV, ConstantInt::get(Ty, i));
+    C = ConstantExpr::getBitCast(C, DstEltTy);
+    Result.push_back(C);
+  }
+
+  return ConstantVector::get(Result);
+}
+
 /// This function determines which opcode to use to fold two constant cast
 /// expressions together. It uses CastInst::isEliminableCastPair to determine
 /// the opcode. Consequently its just a wrapper around that function.
@@ -75,12 +114,24 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
   // Handle casts from one vector constant to another.  We know that the src
   // and dest type have the same size (otherwise its an illegal cast).
   if (VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
+    if (VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
+      assert(DestPTy->getPrimitiveSizeInBits() ==
+                 SrcTy->getPrimitiveSizeInBits() &&
+             "Not cast between same sized vectors!");
+      SrcTy = nullptr;
+      // First, check for null.  Undef is already handled.
+      if (isa<ConstantAggregateZero>(V))
+        return Constant::getNullValue(DestTy);
+
+      // Handle ConstantVector and ConstantAggregateVector.
+      return BitCastConstantVector(V, DestPTy);
+    }
+
     // Canonicalize scalar-to-vector bitcasts into vector-to-vector bitcasts
     // This allows for other simplifications (although some of them
     // can only be handled by Analysis/ConstantFolding.cpp).
     if (isa<ConstantInt>(V) || isa<ConstantFP>(V))
       return ConstantExpr::getBitCast(ConstantVector::get(V), DestPTy);
-    return nullptr;
   }
 
   // Handle integral constant input.

>From e21532418528f0cbcec4c811ce0ea41e8b3075fe Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Fri, 3 Nov 2023 10:29:33 +0000
Subject: [PATCH 05/76] [flang][StackArrays] skip analysis of very large
 functions (#71047)

The stack arrays pass uses data flow analysis to determine whether heap
allocations are freed on all paths out of the function.

`interp_domain_em_part2` in spec2017 wrf generates over 120k operations,
including almost 5k fir.if operations and over 200 fir.do_loop
operations, all in the same function. The MLIR data flow analysis
framework cannot provide reasonable performance for such cases because
there is a combinatorial explosion in the number of control flow paths
through the function, all of which must be checked to determine if the
heap allocations will be freed.

This patch skips the stack arrays pass for ridiculously large functions
(defined as having more than 1000 fir.allocmem operations). This
threshold is configurable at runtime with a command line argument.

With this patch, compiling this file is more than 80% faster.
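
The guard itself is a straightforward walk-and-count; a minimal sketch
of its shape (tooManyAllocs is a hypothetical wrapper, with names taken
from the patch below):

    // Sketch, not the literal hunk: count fir.allocmem operations with
    // a linear walk, which is cheap compared to the data flow analysis
    // it guards.
    static bool tooManyAllocs(mlir::Operation *func) {
      std::size_t nAllocs = 0;
      func->walk([&nAllocs](fir::AllocMemOp) { ++nAllocs; });
      // A threshold of 0 disables the limit entirely.
      return maxAllocsPerFunc != 0 && nAllocs > maxAllocsPerFunc;
    }

Note that skipping is reported as mlir::success(): an oversized function
is simply left unoptimized rather than treated as a pass failure.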
---
 flang/lib/Optimizer/Transforms/StackArrays.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp
index 9b90aed5a17ae73..b51e2aae1a9d510 100644
--- a/flang/lib/Optimizer/Transforms/StackArrays.cpp
+++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -42,6 +42,13 @@ namespace fir {
 
 #define DEBUG_TYPE "stack-arrays"
 
+static llvm::cl::opt<std::size_t> maxAllocsPerFunc(
+    "stack-arrays-max-allocs",
+    llvm::cl::desc("The maximum number of heap allocations to consider in one "
+                   "function before skipping (to save compilation time). Set "
+                   "to 0 for no limit."),
+    llvm::cl::init(1000), llvm::cl::Hidden);
+
 namespace {
 
 /// The state of an SSA value at each program point
@@ -411,6 +418,17 @@ void AllocationAnalysis::processOperation(mlir::Operation *op) {
 mlir::LogicalResult
 StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) {
   assert(mlir::isa<mlir::func::FuncOp>(func));
+  size_t nAllocs = 0;
+  func->walk([&nAllocs](fir::AllocMemOp) { nAllocs++; });
+  // don't bother with the analysis if there are no heap allocations
+  if (nAllocs == 0)
+    return mlir::success();
+  if ((maxAllocsPerFunc != 0) && (nAllocs > maxAllocsPerFunc)) {
+    LLVM_DEBUG(llvm::dbgs() << "Skipping stack arrays for function with "
+                            << nAllocs << " heap allocations");
+    return mlir::success();
+  }
+
   mlir::DataFlowSolver solver;
   // constant propagation is required for dead code analysis, dead code analysis
   // is required to mark blocks live (required for mlir dense dfa)

>From 3f1a86bc27a587d3c56f966e51c2513a3a4f44b5 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 11:29:54 +0100
Subject: [PATCH 06/76] [ConstantFold] Add tests for vector bitcast of all ones
 (NFC)

---
 llvm/test/Assembler/ConstantExprFold.ll | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/test/Assembler/ConstantExprFold.ll b/llvm/test/Assembler/ConstantExprFold.ll
index b7e4075ea9e1ede..4ce44d2e55130df 100644
--- a/llvm/test/Assembler/ConstantExprFold.ll
+++ b/llvm/test/Assembler/ConstantExprFold.ll
@@ -31,6 +31,9 @@
 @gep3 = global <2 x ptr> getelementptr(i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 0>)
 @gep4 = global <2 x ptr> getelementptr({ i8 }, <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 0>, <2 x i32> <i32 0, i32 0>)
 
+@bitcast1 = global <2 x i32> bitcast (<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1> to <2 x i32>)
+@bitcast2 = global <4 x i16> bitcast (<2 x i32> <i32 -1, i32 -1> to <4 x i16>)
+
 
 ; Need a function to make update_test_checks.py work.
 ;.
@@ -48,6 +51,8 @@
 ; CHECK: @[[GEP2:[a-zA-Z0-9_$"\\.-]+]] = global <2 x ptr> undef
 ; CHECK: @[[GEP3:[a-zA-Z0-9_$"\\.-]+]] = global <2 x ptr> zeroinitializer
 ; CHECK: @[[GEP4:[a-zA-Z0-9_$"\\.-]+]] = global <2 x ptr> zeroinitializer
+; CHECK: @[[BITCAST1:[a-zA-Z0-9_$"\\.-]+]] = global <2 x i32> <i32 -1, i32 -1>
+; CHECK: @[[BITCAST2:[a-zA-Z0-9_$"\\.-]+]] = global <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
 ;.
 define void @dummy() {
 ; CHECK-LABEL: @dummy(

>From 4be8a7bda55c5b50832b773b204f75cd26c5979d Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Fri, 3 Nov 2023 10:35:21 +0000
Subject: [PATCH 07/76] [lldb] Fix TestVTableValue on 32 bit

7fbd427f5ebea4a4ebf25747758851875bb7e173 added a test that overwrites
a vtable entry, but it writes and expects a 64-bit value. Add the 32-bit
equivalents.
---
 .../API/functionalities/vtable/TestVTableValue.py    | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/lldb/test/API/functionalities/vtable/TestVTableValue.py b/lldb/test/API/functionalities/vtable/TestVTableValue.py
index 5b243e0646f4c28..1c238ad60739bd9 100644
--- a/lldb/test/API/functionalities/vtable/TestVTableValue.py
+++ b/lldb/test/API/functionalities/vtable/TestVTableValue.py
@@ -132,13 +132,19 @@ def test_overwrite_vtable(self):
         # Overwrite the first entry in the vtable and make sure we can still
         # see the bogus value which should have no summary
         vtable_addr = vtable.GetValueAsUnsigned()
-        data = str("\x01\x01\x01\x01\x01\x01\x01\x01")
+
+        is_64bit = self.process().GetAddressByteSize() == 8
+        data = str(
+            "\x01\x01\x01\x01\x01\x01\x01\x01" if is_64bit else "\x01\x01\x01\x01"
+        )
         error = lldb.SBError()
         process.WriteMemory(vtable_addr, data, error)
 
         scribbled_child = vtable.GetChildAtIndex(0)
-        self.assertEquals(scribbled_child.GetValueAsUnsigned(0),
-                          0x0101010101010101)
+        self.assertEquals(
+            scribbled_child.GetValueAsUnsigned(0),
+            0x0101010101010101 if is_64bit else 0x01010101,
+        )
         self.assertEquals(scribbled_child.GetSummary(), None)
 
     def expected_vtable_addr(self, var: lldb.SBValue) -> int:

>From d49a893cdbea0dd6f8fde7dc9f321b2e0d169bba Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 11:04:12 +0100
Subject: [PATCH 08/76] Reapply [ConstantFold] Remove unnecessary
 BitCastConstantVector() (NFCI)

ConstantFoldCastInstruction() already has generic code to perform
lane-wise casts for vectors. There is no need to repeat it
specifically for bitcasts.

However, we do need to keep the special case for vectors of -1,
which is not handled elsewhere.
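
The surviving special case is tiny; as a hedged paraphrase (not the full
FoldBitCast):

    // An all-ones bit pattern is invariant under any same-width
    // reinterpretation of the lanes, so it can fold without the
    // endianness and element-count reasoning the generic path needs.
    if (V->isAllOnesValue())
      return Constant::getAllOnesValue(DestTy);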
---
 llvm/lib/IR/ConstantFold.cpp | 54 ++----------------------------------
 1 file changed, 3 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index a263cdcf4008a2a..d51e9c67592eb95 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -37,45 +37,6 @@ using namespace llvm::PatternMatch;
 //                ConstantFold*Instruction Implementations
 //===----------------------------------------------------------------------===//
 
-/// Convert the specified vector Constant node to the specified vector type.
-/// At this point, we know that the elements of the input vector constant are
-/// all simple integer or FP values.
-static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
-
-  if (CV->isAllOnesValue()) return Constant::getAllOnesValue(DstTy);
-  if (CV->isNullValue()) return Constant::getNullValue(DstTy);
-
-  // Do not iterate on scalable vector. The num of elements is unknown at
-  // compile-time.
-  if (isa<ScalableVectorType>(DstTy))
-    return nullptr;
-
-  // If this cast changes element count then we can't handle it here:
-  // doing so requires endianness information.  This should be handled by
-  // Analysis/ConstantFolding.cpp
-  unsigned NumElts = cast<FixedVectorType>(DstTy)->getNumElements();
-  if (NumElts != cast<FixedVectorType>(CV->getType())->getNumElements())
-    return nullptr;
-
-  Type *DstEltTy = DstTy->getElementType();
-  // Fast path for splatted constants.
-  if (Constant *Splat = CV->getSplatValue()) {
-    return ConstantVector::getSplat(DstTy->getElementCount(),
-                                    ConstantExpr::getBitCast(Splat, DstEltTy));
-  }
-
-  SmallVector<Constant*, 16> Result;
-  Type *Ty = IntegerType::get(CV->getContext(), 32);
-  for (unsigned i = 0; i != NumElts; ++i) {
-    Constant *C =
-      ConstantExpr::getExtractElement(CV, ConstantInt::get(Ty, i));
-    C = ConstantExpr::getBitCast(C, DstEltTy);
-    Result.push_back(C);
-  }
-
-  return ConstantVector::get(Result);
-}
-
 /// This function determines which opcode to use to fold two constant cast
 /// expressions together. It uses CastInst::isEliminableCastPair to determine
 /// the opcode. Consequently its just a wrapper around that function.
@@ -114,24 +75,15 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
   // Handle casts from one vector constant to another.  We know that the src
   // and dest type have the same size (otherwise its an illegal cast).
   if (VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
-    if (VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
-      assert(DestPTy->getPrimitiveSizeInBits() ==
-                 SrcTy->getPrimitiveSizeInBits() &&
-             "Not cast between same sized vectors!");
-      SrcTy = nullptr;
-      // First, check for null.  Undef is already handled.
-      if (isa<ConstantAggregateZero>(V))
-        return Constant::getNullValue(DestTy);
-
-      // Handle ConstantVector and ConstantAggregateVector.
-      return BitCastConstantVector(V, DestPTy);
-    }
+    if (V->isAllOnesValue())
+      return Constant::getAllOnesValue(DestTy);
 
     // Canonicalize scalar-to-vector bitcasts into vector-to-vector bitcasts
     // This allows for other simplifications (although some of them
     // can only be handled by Analysis/ConstantFolding.cpp).
     if (isa<ConstantInt>(V) || isa<ConstantFP>(V))
       return ConstantExpr::getBitCast(ConstantVector::get(V), DestPTy);
+    return nullptr;
   }
 
   // Handle integral constant input.

>From 01689175251f2624fb9d077666657aa21e3f7850 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 11:41:14 +0100
Subject: [PATCH 09/76] [ConstantFold] Remove redundant handling for casts of
 null (NFCI)

ConstantFoldCastInstruction() has generic handling for null values
at the top. No need to repeat it for inttoptr and ptrtoint.
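
For reference, a hedged paraphrase of that generic handling near the top
of ConstantFoldCastInstruction() (not the literal code):

    // A null input folds to the destination type's null value, which
    // already covers inttoptr (0 -> null pointer) and ptrtoint (null
    // pointer -> 0). AddrSpaceCast is excluded because a null pointer
    // in one address space need not be null in another.
    if (V->isNullValue() && opc != Instruction::AddrSpaceCast)
      return Constant::getNullValue(DestTy);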
---
 llvm/lib/IR/ConstantFold.cpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index d51e9c67592eb95..91bb5b6149f4877 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -301,16 +301,6 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
       return ConstantInt::get(FPC->getContext(), IntVal);
     }
     return nullptr; // Can't fold.
-  case Instruction::IntToPtr:   //always treated as unsigned
-    if (V->isNullValue())       // Is it an integral null value?
-      return ConstantPointerNull::get(cast<PointerType>(DestTy));
-    return nullptr;                   // Other pointer types cannot be casted
-  case Instruction::PtrToInt:   // always treated as unsigned
-    // Is it a null pointer value?
-    if (V->isNullValue())
-      return ConstantInt::get(DestTy, 0);
-    // Other pointer types cannot be casted
-    return nullptr;
   case Instruction::UIToFP:
   case Instruction::SIToFP:
     if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
@@ -359,6 +349,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
   case Instruction::BitCast:
     return FoldBitCast(V, DestTy);
   case Instruction::AddrSpaceCast:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
     return nullptr;
   }
 }

>From 18cc980ca30e5f013542518439da020f45ebe497 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 12:02:12 +0100
Subject: [PATCH 10/76] [ConstantFold] Remove unnecessarily complicated
 evaluateFCmpRelation() (NFCI)

The only case this actually handles is when the operands are identical,
in which case ueq is returned: identical operands are only known to be
"unordered or equal", since the shared value may be NaN.

Given that nearly all FP constant expressions have already been
removed, I think it's safe to say that we aren't going to extend
this code in a way that makes use of the more general structure.
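
A small standalone illustration of why identical operands justify no
more than ueq (uses APFloat; not part of the patch):

    #include "llvm/ADT/APFloat.h"

    using namespace llvm;

    // NaN compares unordered against itself, so even X vs. X is only
    // "unordered or equal": ueq folds to true, one folds to false, and
    // every ordered predicate remains undecidable.
    static bool selfCompareIsOrderedEqual(const APFloat &X) {
      return X.compare(X) == APFloat::cmpEqual; // false when X is NaN
    }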
---
 llvm/lib/IR/ConstantFold.cpp | 139 ++---------------------------------
 1 file changed, 5 insertions(+), 134 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 91bb5b6149f4877..151bd0c7086aa68 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1112,70 +1112,6 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
   return nullptr;
 }
 
-/// This function determines if there is anything we can decide about the two
-/// constants provided. This doesn't need to handle simple things like
-/// ConstantFP comparisons, but should instead handle ConstantExprs.
-/// If we can determine that the two constants have a particular relation to
-/// each other, we should return the corresponding FCmpInst predicate,
-/// otherwise return FCmpInst::BAD_FCMP_PREDICATE. This is used below in
-/// ConstantFoldCompareInstruction.
-///
-/// To simplify this code we canonicalize the relation so that the first
-/// operand is always the most "complex" of the two.  We consider ConstantFP
-/// to be the simplest, and ConstantExprs to be the most complex.
-static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) {
-  assert(V1->getType() == V2->getType() &&
-         "Cannot compare values of different types!");
-
-  // We do not know if a constant expression will evaluate to a number or NaN.
-  // Therefore, we can only say that the relation is unordered or equal.
-  if (V1 == V2) return FCmpInst::FCMP_UEQ;
-
-  if (!isa<ConstantExpr>(V1)) {
-    if (!isa<ConstantExpr>(V2)) {
-      // Simple case, use the standard constant folder.
-      ConstantInt *R = nullptr;
-      R = dyn_cast<ConstantInt>(
-                      ConstantExpr::getFCmp(FCmpInst::FCMP_OEQ, V1, V2));
-      if (R && !R->isZero())
-        return FCmpInst::FCMP_OEQ;
-      R = dyn_cast<ConstantInt>(
-                      ConstantExpr::getFCmp(FCmpInst::FCMP_OLT, V1, V2));
-      if (R && !R->isZero())
-        return FCmpInst::FCMP_OLT;
-      R = dyn_cast<ConstantInt>(
-                      ConstantExpr::getFCmp(FCmpInst::FCMP_OGT, V1, V2));
-      if (R && !R->isZero())
-        return FCmpInst::FCMP_OGT;
-
-      // Nothing more we can do
-      return FCmpInst::BAD_FCMP_PREDICATE;
-    }
-
-    // If the first operand is simple and second is ConstantExpr, swap operands.
-    FCmpInst::Predicate SwappedRelation = evaluateFCmpRelation(V2, V1);
-    if (SwappedRelation != FCmpInst::BAD_FCMP_PREDICATE)
-      return FCmpInst::getSwappedPredicate(SwappedRelation);
-  } else {
-    // Ok, the LHS is known to be a constantexpr.  The RHS can be any of a
-    // constantexpr or a simple constant.
-    ConstantExpr *CE1 = cast<ConstantExpr>(V1);
-    switch (CE1->getOpcode()) {
-    case Instruction::FPTrunc:
-    case Instruction::FPExt:
-    case Instruction::UIToFP:
-    case Instruction::SIToFP:
-      // We might be able to do something with these but we don't right now.
-      break;
-    default:
-      break;
-    }
-  }
-  // There are MANY other foldings that we could perform here.  They will
-  // probably be added on demand, as they seem needed.
-  return FCmpInst::BAD_FCMP_PREDICATE;
-}
-
 static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1,
                                                       const GlobalValue *GV2) {
   auto isGlobalUnsafeForEquality = [](const GlobalValue *GV) {
@@ -1511,79 +1447,14 @@ Constant *llvm::ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
     return ConstantVector::get(ResElts);
   }
 
-  if (C1->getType()->isFloatingPointTy() &&
-      // Only call evaluateFCmpRelation if we have a constant expr to avoid
-      // infinite recursive loop
-      (isa<ConstantExpr>(C1) || isa<ConstantExpr>(C2))) {
-    int Result = -1;  // -1 = unknown, 0 = known false, 1 = known true.
-    switch (evaluateFCmpRelation(C1, C2)) {
-    default: llvm_unreachable("Unknown relation!");
-    case FCmpInst::FCMP_UNO:
-    case FCmpInst::FCMP_ORD:
-    case FCmpInst::FCMP_UNE:
-    case FCmpInst::FCMP_ULT:
-    case FCmpInst::FCMP_UGT:
-    case FCmpInst::FCMP_ULE:
-    case FCmpInst::FCMP_UGE:
-    case FCmpInst::FCMP_TRUE:
-    case FCmpInst::FCMP_FALSE:
-    case FCmpInst::BAD_FCMP_PREDICATE:
-      break; // Couldn't determine anything about these constants.
-    case FCmpInst::FCMP_OEQ: // We know that C1 == C2
-      Result =
-          (Predicate == FCmpInst::FCMP_UEQ || Predicate == FCmpInst::FCMP_OEQ ||
-           Predicate == FCmpInst::FCMP_ULE || Predicate == FCmpInst::FCMP_OLE ||
-           Predicate == FCmpInst::FCMP_UGE || Predicate == FCmpInst::FCMP_OGE);
-      break;
-    case FCmpInst::FCMP_OLT: // We know that C1 < C2
-      Result =
-          (Predicate == FCmpInst::FCMP_UNE || Predicate == FCmpInst::FCMP_ONE ||
-           Predicate == FCmpInst::FCMP_ULT || Predicate == FCmpInst::FCMP_OLT ||
-           Predicate == FCmpInst::FCMP_ULE || Predicate == FCmpInst::FCMP_OLE);
-      break;
-    case FCmpInst::FCMP_OGT: // We know that C1 > C2
-      Result =
-          (Predicate == FCmpInst::FCMP_UNE || Predicate == FCmpInst::FCMP_ONE ||
-           Predicate == FCmpInst::FCMP_UGT || Predicate == FCmpInst::FCMP_OGT ||
-           Predicate == FCmpInst::FCMP_UGE || Predicate == FCmpInst::FCMP_OGE);
-      break;
-    case FCmpInst::FCMP_OLE: // We know that C1 <= C2
-      // We can only partially decide this relation.
-      if (Predicate == FCmpInst::FCMP_UGT || Predicate == FCmpInst::FCMP_OGT)
-        Result = 0;
-      else if (Predicate == FCmpInst::FCMP_ULT ||
-               Predicate == FCmpInst::FCMP_OLT)
-        Result = 1;
-      break;
-    case FCmpInst::FCMP_OGE: // We known that C1 >= C2
-      // We can only partially decide this relation.
-      if (Predicate == FCmpInst::FCMP_ULT || Predicate == FCmpInst::FCMP_OLT)
-        Result = 0;
-      else if (Predicate == FCmpInst::FCMP_UGT ||
-               Predicate == FCmpInst::FCMP_OGT)
-        Result = 1;
-      break;
-    case FCmpInst::FCMP_ONE: // We know that C1 != C2
-      // We can only partially decide this relation.
-      if (Predicate == FCmpInst::FCMP_OEQ || Predicate == FCmpInst::FCMP_UEQ)
-        Result = 0;
-      else if (Predicate == FCmpInst::FCMP_ONE ||
-               Predicate == FCmpInst::FCMP_UNE)
-        Result = 1;
-      break;
-    case FCmpInst::FCMP_UEQ: // We know that C1 == C2 || isUnordered(C1, C2).
-      // We can only partially decide this relation.
+  if (C1->getType()->isFPOrFPVectorTy()) {
+    if (C1 == C2) {
+      // We know that C1 == C2 || isUnordered(C1, C2).
       if (Predicate == FCmpInst::FCMP_ONE)
-        Result = 0;
+        return ConstantInt::getFalse(ResultTy);
       else if (Predicate == FCmpInst::FCMP_UEQ)
-        Result = 1;
-      break;
+        return ConstantInt::getTrue(ResultTy);
     }
-
-    // If we evaluated the result, return it now.
-    if (Result != -1)
-      return ConstantInt::get(ResultTy, Result);
-
   } else {
     // Evaluate the relation between the two constants, per the predicate.
     int Result = -1;  // -1 = unknown, 0 = known false, 1 = known true.

>From c1a9eabf56db21392587f76b4abaa30a01f3643a Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 12:14:38 +0100
Subject: [PATCH 11/76] [ConstantFold] Don't mention irrelevant opcodes (NFCI)

This is folding an icmp, so fptrunc, fpext, uitofp, sitofp cannot
appear here at all. Also drop the explicit checks for trunc, fptoui,
fptosi, which can appear but are merely an arbitrary subset of the
opcodes not handled by this code.
---
 llvm/lib/IR/ConstantFold.cpp | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 151bd0c7086aa68..1d7785760214f77 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1243,22 +1243,13 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
     Constant *CE1Op0 = CE1->getOperand(0);
 
     switch (CE1->getOpcode()) {
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::FPExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-      break; // We can't evaluate floating point casts or truncations.
-
     case Instruction::BitCast:
       // If this is a global value cast, check to see if the RHS is also a
       // GlobalValue.
       if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0))
         if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2))
           return areGlobalsPotentiallyEqual(GV, GV2);
-      [[fallthrough]];
-    case Instruction::UIToFP:
-    case Instruction::SIToFP:
+
       // We can't evaluate floating point casts or truncations.
       if (CE1Op0->getType()->isFPOrFPVectorTy())
         break;

>From fe206578f2e9b864ec6a703974a0e2a9001bb11b Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 12:20:07 +0100
Subject: [PATCH 12/76] [ConstantFold] Remove handling for comparison of
 bitcasted global (NFCI)

The bitcast will be folded away in this case, no need to handle it
explicitly.
---
 llvm/lib/IR/ConstantFold.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 1d7785760214f77..a10796e6da99dca 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1244,12 +1244,6 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
 
     switch (CE1->getOpcode()) {
     case Instruction::BitCast:
-      // If this is a global value cast, check to see if the RHS is also a
-      // GlobalValue.
-      if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0))
-        if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2))
-          return areGlobalsPotentiallyEqual(GV, GV2);
-
       // We can't evaluate floating point casts or truncations.
       if (CE1Op0->getType()->isFPOrFPVectorTy())
         break;

>From a7176f8a25fc6930ee7fd0cfcde4d9a96010a5a8 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 12:22:59 +0100
Subject: [PATCH 13/76] [ConstantFold] Remove handling for icmp of bitcast

This only handles the case where the bitcast result is an integer
or pointer, and the input is not FP. This means that the input
can only be a vector. However, converting a comparison of the
whole vector into an element-wise comparison is generally not
correct.

I assume that this code was originally intended to handle the case
where a pointer bitcast is compared to a null pointer, which is
no longer relevant with opaque pointers.

Given the complete lack of test coverage, and the risk of
miscompiles if this code actually did something, I'm opting to
remove it entirely.
---
 llvm/lib/IR/ConstantFold.cpp | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index a10796e6da99dca..a4df579406538a4 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1243,20 +1243,6 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
     Constant *CE1Op0 = CE1->getOperand(0);
 
     switch (CE1->getOpcode()) {
-    case Instruction::BitCast:
-      // We can't evaluate floating point casts or truncations.
-      if (CE1Op0->getType()->isFPOrFPVectorTy())
-        break;
-
-      // If the cast is not actually changing bits, and the second operand is a
-      // null pointer, do the comparison with the pre-casted value.
-      if (V2->isNullValue() && CE1->getType()->isIntOrPtrTy()) {
-        return evaluateICmpRelation(CE1Op0,
-                                    Constant::getNullValue(CE1Op0->getType()),
-                                    isSigned);
-      }
-      break;
-
     case Instruction::GetElementPtr: {
       GEPOperator *CE1GEP = cast<GEPOperator>(CE1);
       // Ok, since this is a getelementptr, we know that the constant has a

>From 1111ef025762d9b7ecc3cafc576083987ae63fe6 Mon Sep 17 00:00:00 2001
From: Neale Ferguson <neale at sinenomine.net>
Date: Fri, 3 Nov 2023 07:42:55 -0400
Subject: [PATCH 14/76] Add openmp support to System z (#66081)

* openmp/README.rst
  - Add s390x to those platforms supported

* openmp/libomptarget/plugins-nextgen/CMakeLists.txt
  - Add s390x subdirectory

* openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
  - Add s390x definitions

* openmp/runtime/CMakeLists.txt
  - Add s390x to those platforms supported

* openmp/runtime/cmake/LibompGetArchitecture.cmake
  - Define s390x ARCHITECTURE

* openmp/runtime/cmake/LibompMicroTests.cmake
  - Add dependencies for System z (aka s390x)

* openmp/runtime/cmake/LibompUtils.cmake
  - Add S390X to the mix

* openmp/runtime/cmake/config-ix.cmake
  - Add s390x as a supported LIBOMP_ARCH

* openmp/runtime/src/kmp_affinity.h
  - Define __NR_sched_[get|set]affinity for s390x

* openmp/runtime/src/kmp_config.h.cmake
  - Define CACHE_LINE for s390x

* openmp/runtime/src/kmp_os.h
  - Add KMP_ARCH_S390X to support checks

* openmp/runtime/src/kmp_platform.h
  - Define KMP_ARCH_S390X

* openmp/runtime/src/kmp_runtime.cpp
  - Generate code when KMP_ARCH_S390X is defined

* openmp/runtime/src/kmp_tasking.cpp
  - Generate code when KMP_ARCH_S390X is defined

* openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
  - Define ITT_ARCH_S390X

* openmp/runtime/src/z_Linux_asm.S
  - Instantiate __kmp_invoke_microtask for s390x

* openmp/runtime/src/z_Linux_util.cpp
  - Generate code when KMP_ARCH_S390X is defined

* openmp/runtime/test/ompt/callback.h
  - Define print_possible_return_addresses for s390x

* openmp/runtime/tools/lib/Platform.pm
  - Return s390x as platform and host architecture

* openmp/runtime/tools/lib/Uname.pm
  - Set hardware platform value for s390x
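
One hunk deserves an extra note: in openmp/runtime/src/kmp_affinity.cpp
the s390x physical_package_id alone does not uniquely identify a
package, so the patch folds the book and drawer topology levels into its
upper bits. A minimal sketch of that packing (packPackageId is a
hypothetical helper; field widths are inferred from the shifts in the
patch):

    #include <cstdint>

    // Make the package key unique across books and drawers. Assumes
    // physical_package_id and book_id each fit in 8 bits, as implied
    // by the 8- and 16-bit shifts in the patch.
    static uint32_t packPackageId(uint32_t pkg_id, uint32_t book_id,
                                  uint32_t drawer_id) {
      return pkg_id | (book_id << 8) | (drawer_id << 16);
    }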
---
 openmp/README.rst                             |   4 +-
 .../plugins-nextgen/CMakeLists.txt            |   1 +
 .../plugins-nextgen/s390x/CMakeLists.txt      |  17 ++
 openmp/runtime/CMakeLists.txt                 |   9 +-
 .../runtime/cmake/LibompGetArchitecture.cmake |   2 +
 openmp/runtime/cmake/LibompMicroTests.cmake   |   3 +
 openmp/runtime/cmake/LibompUtils.cmake        |   2 +
 openmp/runtime/cmake/config-ix.cmake          |   3 +-
 openmp/runtime/src/kmp_affinity.cpp           |  33 ++++
 openmp/runtime/src/kmp_affinity.h             |  11 ++
 openmp/runtime/src/kmp_config.h.cmake         |   2 +
 openmp/runtime/src/kmp_os.h                   |   6 +-
 openmp/runtime/src/kmp_platform.h             |   7 +-
 openmp/runtime/src/kmp_runtime.cpp            |   3 +-
 openmp/runtime/src/kmp_tasking.cpp            |  10 +-
 .../thirdparty/ittnotify/ittnotify_config.h   |   6 +
 openmp/runtime/src/z_Linux_asm.S              | 159 +++++++++++++++++-
 openmp/runtime/src/z_Linux_util.cpp           |   2 +-
 openmp/runtime/test/lit.cfg                   |   2 +
 openmp/runtime/test/ompt/callback.h           |  16 ++
 openmp/runtime/tools/lib/Platform.pm          |   6 +-
 openmp/runtime/tools/lib/Uname.pm             |   2 +
 22 files changed, 290 insertions(+), 16 deletions(-)
 create mode 100644 openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt

diff --git a/openmp/README.rst b/openmp/README.rst
index bb9443df56d7656..0e4916f44c68287 100644
--- a/openmp/README.rst
+++ b/openmp/README.rst
@@ -141,7 +141,7 @@ Options for all Libraries
 Options for ``libomp``
 ----------------------
 
-**LIBOMP_ARCH** = ``aarch64|arm|i386|loongarch64|mic|mips|mips64|ppc64|ppc64le|x86_64|riscv64``
+**LIBOMP_ARCH** = ``aarch64|arm|i386|loongarch64|mic|mips|mips64|ppc64|ppc64le|x86_64|riscv64|s390x``
   The default value for this option is chosen based on probing the compiler for
   architecture macros (e.g., is ``__x86_64__`` predefined by compiler?).
 
@@ -198,7 +198,7 @@ Optional Features
 **LIBOMP_OMPT_SUPPORT** = ``ON|OFF``
   Include support for the OpenMP Tools Interface (OMPT).
   This option is supported and ``ON`` by default for x86, x86_64, AArch64,
-  PPC64, RISCV64 and LoongArch64 on Linux* and macOS*.
+  PPC64, RISCV64, LoongArch64, and s390x on Linux* and macOS*.
   This option is ``OFF`` if this feature is not supported for the platform.
 
 **LIBOMP_OMPT_OPTIONAL** = ``ON|OFF``
diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
index 9b4f4a5866e7987..d81e5d37d7c08df 100644
--- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -96,6 +96,7 @@ add_subdirectory(cuda)
 add_subdirectory(ppc64)
 add_subdirectory(ppc64le)
 add_subdirectory(x86_64)
+add_subdirectory(s390x)
 
 # Make sure the parent scope can see the plugins that will be created.
 set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
diff --git a/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
new file mode 100644
index 000000000000000..1b12a292899980e
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a s390x machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+ build_generic_elf64("SystemZ" "S390X" "s390x" "s390x-ibm-linux-gnu" "22")
+else()
+ libomptarget_say("Not building s390x NextGen offloading plugin: machine not found in the system.")
+endif()
diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 4441c4babdc07c0..8a913989272c4c5 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -30,7 +30,7 @@ if(${OPENMP_STANDALONE_BUILD})
   # If adding a new architecture, take a look at cmake/LibompGetArchitecture.cmake
   libomp_get_architecture(LIBOMP_DETECTED_ARCH)
   set(LIBOMP_ARCH ${LIBOMP_DETECTED_ARCH} CACHE STRING
-    "The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic/mips/mips64/riscv64/loongarch64/ve).")
+    "The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic/mips/mips64/riscv64/loongarch64/ve/s390x).")
   # Should assertions be enabled?  They are on by default.
   set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL
     "enable assertions?")
@@ -65,6 +65,8 @@ else() # Part of LLVM build
     set(LIBOMP_ARCH loongarch64)
   elseif(LIBOMP_NATIVE_ARCH MATCHES "ve")
     set(LIBOMP_ARCH ve)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "s390x")
+    set(LIBOMP_ARCH s390x)
   else()
     # last ditch effort
     libomp_get_architecture(LIBOMP_ARCH)
@@ -85,7 +87,7 @@ if(LIBOMP_ARCH STREQUAL "aarch64")
   endif()
 endif()
 
-libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 aarch64_a64fx mic mips mips64 riscv64 loongarch64 ve)
+libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 aarch64_a64fx mic mips mips64 riscv64 loongarch64 ve s390x)
 
 set(LIBOMP_LIB_TYPE normal CACHE STRING
   "Performance,Profiling,Stubs library (normal/profile/stubs)")
@@ -165,6 +167,7 @@ set(MIPS FALSE)
 set(RISCV64 FALSE)
 set(LOONGARCH64 FALSE)
 set(VE FALSE)
+set(S390X FALSE)
 if("${LIBOMP_ARCH}" STREQUAL "i386" OR "${LIBOMP_ARCH}" STREQUAL "32")    # IA-32 architecture
   set(IA32 TRUE)
 elseif("${LIBOMP_ARCH}" STREQUAL "x86_64" OR "${LIBOMP_ARCH}" STREQUAL "32e") # Intel(R) 64 architecture
@@ -193,6 +196,8 @@ elseif("${LIBOMP_ARCH}" STREQUAL "loongarch64") # LoongArch64 architecture
     set(LOONGARCH64 TRUE)
 elseif("${LIBOMP_ARCH}" STREQUAL "ve") # VE architecture
     set(VE TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "s390x") # S390x (Z) architecture
+    set(S390X TRUE)
 endif()
 
 # Set some flags based on build_type
diff --git a/openmp/runtime/cmake/LibompGetArchitecture.cmake b/openmp/runtime/cmake/LibompGetArchitecture.cmake
index 98bfce9ae990a7b..9d4f08b92de50d5 100644
--- a/openmp/runtime/cmake/LibompGetArchitecture.cmake
+++ b/openmp/runtime/cmake/LibompGetArchitecture.cmake
@@ -51,6 +51,8 @@ function(libomp_get_architecture return_arch)
       #error ARCHITECTURE=loongarch64
     #elif defined(__ve__)
       #error ARCHITECTURE=ve
+    #elif defined(__s390x__)
+      #error ARCHITECTURE=s390x
     #else
       #error ARCHITECTURE=UnknownArchitecture
     #endif
diff --git a/openmp/runtime/cmake/LibompMicroTests.cmake b/openmp/runtime/cmake/LibompMicroTests.cmake
index 88deb461dbaf3a2..e8cc218af0c294f 100644
--- a/openmp/runtime/cmake/LibompMicroTests.cmake
+++ b/openmp/runtime/cmake/LibompMicroTests.cmake
@@ -217,6 +217,9 @@ else()
     elseif(${LOONGARCH64})
       libomp_append(libomp_expected_library_deps libc.so.6)
       libomp_append(libomp_expected_library_deps ld.so.1)
+    elseif(${S390X})
+      libomp_append(libomp_expected_library_deps libc.so.6)
+      libomp_append(libomp_expected_library_deps ld.so.1)
     endif()
     libomp_append(libomp_expected_library_deps libpthread.so.0 IF_FALSE STUBS_LIBRARY)
     libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
diff --git a/openmp/runtime/cmake/LibompUtils.cmake b/openmp/runtime/cmake/LibompUtils.cmake
index 0151ca0ea826bd7..139eabb45c54f74 100644
--- a/openmp/runtime/cmake/LibompUtils.cmake
+++ b/openmp/runtime/cmake/LibompUtils.cmake
@@ -113,6 +113,8 @@ function(libomp_get_legal_arch return_arch_string)
     set(${return_arch_string} "LOONGARCH64" PARENT_SCOPE)
   elseif(${VE})
     set(${return_arch_string} "VE" PARENT_SCOPE)
+  elseif(${S390X})
+    set(${return_arch_string} "S390X" PARENT_SCOPE)
   else()
     set(${return_arch_string} "${LIBOMP_ARCH}" PARENT_SCOPE)
     libomp_warning_say("libomp_get_legal_arch(): Warning: Unknown architecture: Using ${LIBOMP_ARCH}")
diff --git a/openmp/runtime/cmake/config-ix.cmake b/openmp/runtime/cmake/config-ix.cmake
index 9869aeab0354635..d54d350816926df 100644
--- a/openmp/runtime/cmake/config-ix.cmake
+++ b/openmp/runtime/cmake/config-ix.cmake
@@ -325,7 +325,8 @@ else()
       (LIBOMP_ARCH STREQUAL ppc64le) OR
       (LIBOMP_ARCH STREQUAL ppc64) OR
       (LIBOMP_ARCH STREQUAL riscv64) OR
-      (LIBOMP_ARCH STREQUAL loongarch64))
+      (LIBOMP_ARCH STREQUAL loongarch64) OR
+      (LIBOMP_ARCH STREQUAL s390x))
      AND # OS supported?
      ((WIN32 AND LIBOMP_HAVE_PSAPI) OR APPLE OR (NOT WIN32 AND LIBOMP_HAVE_WEAK_ATTRIBUTE)))
     set(LIBOMP_HAVE_OMPT_SUPPORT TRUE)
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 20c1c610b9159e0..8c608d78bb56fe1 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -2990,6 +2990,9 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
 
   unsigned num_avail = 0;
   *line = 0;
+#if KMP_ARCH_S390X
+  bool reading_s390x_sys_info = true;
+#endif
   while (!feof(f)) {
     // Create an inner scoping level, so that all the goto targets at the end of
     // the loop appear in an outer scoping level. This avoids warnings about
@@ -3035,8 +3038,21 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
       if (*buf == '\n' && *line == 2)
         continue;
 #endif
+#if KMP_ARCH_S390X
+      // s390x /proc/cpuinfo starts with a variable number of lines containing
+      // the overall system information. Skip them.
+      if (reading_s390x_sys_info) {
+        if (*buf == '\n')
+          reading_s390x_sys_info = false;
+        continue;
+      }
+#endif
 
+#if KMP_ARCH_S390X
+      char s1[] = "cpu number";
+#else
       char s1[] = "processor";
+#endif
       if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
         CHECK_LINE;
         char *p = strchr(buf + sizeof(s1) - 1, ':');
@@ -3062,6 +3078,23 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
             threadInfo[num_avail][osIdIndex]);
         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
 
+#if KMP_ARCH_S390X
+        // Disambiguate physical_package_id.
+        unsigned book_id;
+        KMP_SNPRINTF(path, sizeof(path),
+                     "/sys/devices/system/cpu/cpu%u/topology/book_id",
+                     threadInfo[num_avail][osIdIndex]);
+        __kmp_read_from_file(path, "%u", &book_id);
+        threadInfo[num_avail][pkgIdIndex] |= (book_id << 8);
+
+        unsigned drawer_id;
+        KMP_SNPRINTF(path, sizeof(path),
+                     "/sys/devices/system/cpu/cpu%u/topology/drawer_id",
+                     threadInfo[num_avail][osIdIndex]);
+        __kmp_read_from_file(path, "%u", &drawer_id);
+        threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16);
+#endif
+
         KMP_SNPRINTF(path, sizeof(path),
                      "/sys/devices/system/cpu/cpu%u/topology/core_id",
                      threadInfo[num_avail][osIdIndex]);
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index 97808b528538097..5464259784e2ba3 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -297,6 +297,17 @@ class KMPHwlocAffinity : public KMPAffinity {
 #elif __NR_sched_getaffinity != 204
 #error Wrong code for getaffinity system call.
 #endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_S390X
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 239
+#elif __NR_sched_setaffinity != 239
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 240
+#elif __NR_sched_getaffinity != 240
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
 #else
 #error Unknown or unsupported architecture
 #endif /* KMP_ARCH_* */
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
index 58bf64112b1a7a7..5f04301c91c60cd 100644
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -104,6 +104,8 @@
 # define CACHE_LINE 128
 #elif KMP_ARCH_AARCH64_A64FX
 # define CACHE_LINE 256
+#elif KMP_ARCH_S390X
+# define CACHE_LINE 256
 #else
 # define CACHE_LINE 64
 #endif
diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index 2c632112a8d8e35..ca694f6f14933cd 100644
--- a/openmp/runtime/src/kmp_os.h
+++ b/openmp/runtime/src/kmp_os.h
@@ -178,7 +178,8 @@ typedef unsigned long long kmp_uint64;
 #if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS
 #define KMP_SIZE_T_SPEC KMP_UINT32_SPEC
 #elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                 \
-    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
+    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
+    KMP_ARCH_VE || KMP_ARCH_S390X
 #define KMP_SIZE_T_SPEC KMP_UINT64_SPEC
 #else
 #error "Can't determine size_t printf format specifier."
@@ -1043,7 +1044,8 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
 #endif /* KMP_OS_WINDOWS */
 
 #if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS ||     \
-    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
+    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
+    KMP_ARCH_VE || KMP_ARCH_S390X
 #if KMP_OS_WINDOWS
 #undef KMP_MB
 #define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst)
diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h
index 9640d346a916a9f..b7972c7248dd57d 100644
--- a/openmp/runtime/src/kmp_platform.h
+++ b/openmp/runtime/src/kmp_platform.h
@@ -101,6 +101,7 @@
 #define KMP_ARCH_RISCV64 0
 #define KMP_ARCH_LOONGARCH64 0
 #define KMP_ARCH_VE 0
+#define KMP_ARCH_S390X 0
 
 #if KMP_OS_WINDOWS
 #if defined(_M_AMD64) || defined(__x86_64)
@@ -153,6 +154,9 @@
 #elif defined __ve__
 #undef KMP_ARCH_VE
 #define KMP_ARCH_VE 1
+#elif defined __s390x__
+#undef KMP_ARCH_S390X
+#define KMP_ARCH_S390X 1
 #endif
 #endif
 
@@ -217,7 +221,8 @@
 // TODO: Fixme - This is clever, but really fugly
 #if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 +     \
               KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 +             \
-              KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE)
+              KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE +          \
+              KMP_ARCH_S390X)
 #error Unknown or unsupported architecture
 #endif
 
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index bd5b41e0aa9922f..25136691bc72de9 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -8894,7 +8894,8 @@ __kmp_determine_reduction_method(
     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
 
 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
-    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
+    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
+    KMP_ARCH_VE || KMP_ARCH_S390X
 
 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||        \
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index e8eb6b02650377c..f90ae9cabab79fa 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -1554,7 +1554,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   task = KMP_TASKDATA_TO_TASK(taskdata);
 
 // Make sure task & taskdata are aligned appropriately
-#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
+#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
   KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
   KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
 #else
@@ -1737,8 +1737,12 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
 // gtid: global thread ID of caller
 // task: the task to invoke
 // current_task: the task to resume after task invocation
-static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
-                              kmp_taskdata_t *current_task) {
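+// On s390x, keep the stack back chain in this function (the "backchain"
+// target attribute) so its frames can be walked; cf. -mbackchain in lit.cfg.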
+#ifdef __s390x__
+__attribute__((target("backchain")))
+#endif
+static void
+__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
+                  kmp_taskdata_t *current_task) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   kmp_info_t *thread;
   int discard = 0 /* false */;
diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
index ff37eb4ed175e67..bd3fd9b43e574d1 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
+++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
@@ -166,6 +166,10 @@
 #define ITT_ARCH_VE 8
 #endif /* ITT_ARCH_VE */
 
+#ifndef ITT_ARCH_S390X
+#define ITT_ARCH_S390X 8
+#endif /* ITT_ARCH_S390X */
+
 #ifndef ITT_ARCH
 #if defined _M_IX86 || defined __i386__
 #define ITT_ARCH ITT_ARCH_IA32
@@ -181,6 +185,8 @@
 #define ITT_ARCH ITT_ARCH_PPC64
 #elif defined __ve__
 #define ITT_ARCH ITT_ARCH_VE
+#elif defined __s390x__
+#define ITT_ARCH ITT_ARCH_S390X
 #endif
 #endif
 
diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index 2c0df6e3b08505a..a72705528d4162e 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -2252,6 +2252,159 @@ __kmp_invoke_microtask:
 
 #endif /* KMP_ARCH_VE */
 
+#if KMP_ARCH_S390X
+
+//------------------------------------------------------------------------
+//
+// typedef void (*microtask_t)(int *gtid, int *tid, ...);
+//
+// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+//                            void *p_argv[]
+// #if OMPT_SUPPORT
+//                            ,
+//                            void **exit_frame_ptr
+// #endif
+//                            ) {
+// #if OMPT_SUPPORT
+//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+//   (*pkfn)(&gtid, &tid, argv[0], ...);
+//
+//   return 1;
+// }
+//
+// Parameters:
+//   r2: pkfn
+//   r3: gtid
+//   r4: tid
+//   r5: argc
+//   r6: p_argv
+//   SP+160: exit_frame_ptr
+//
+// Locals:
+//   __gtid: gtid param pushed on stack so we can pass &gtid to pkfn
+//   __tid: tid param pushed on stack so we can pass &tid to pkfn
+//
+// Temp. registers:
+//
+//  r0: used to fetch argv slots
+//  r7: used as temporary for number of remaining pkfn parms
+//  r8: argv
+//  r9: pkfn
+//  r10: stack size
+//  r11: previous fp
+//  r12: stack parameter area
+//  r13: argv slot
+//
+// return: r2 (always 1/TRUE)
+//
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+	.text
+	.globl	__kmp_invoke_microtask
+	.p2align	1
+	.type	__kmp_invoke_microtask,@function
+__kmp_invoke_microtask:
+	.cfi_startproc
+
+	stmg	%r6,%r14,48(%r15)
+	.cfi_offset %r6, -112
+	.cfi_offset %r7, -104
+	.cfi_offset %r8, -96
+	.cfi_offset %r9, -88
+	.cfi_offset %r10, -80
+	.cfi_offset %r11, -72
+	.cfi_offset %r12, -64
+	.cfi_offset %r13, -56
+	.cfi_offset %r14, -48
+	.cfi_offset %r15, -40
+	lgr	%r11,%r15
+	.cfi_def_cfa %r11, 160
+
+	// Compute the dynamic stack size:
+	//
+	// - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
+	//   reference
+	// - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
+	//   function by register. Given that we have 5 such registers (r[2-6])
+	//   and 'argc' + 2 arguments (counting &gtid and &tid), we need to
+	//   reserve max(0, argc - 3)*8 extra bytes
+	//
+	// The total number of bytes is then max(0, argc - 3)*8 + 8
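+	//
+	// For example: with argc = 5, r2-r6 carry &gtid, &tid and argv[0..2],
+	// so max(0, 5 - 3) = 2 argv slots (16 bytes) spill to the stack.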
+
+	lgr	%r10,%r5
+	aghi	%r10,-2
+	jnm	0f
+	lghi	%r10,0
+0:
+	sllg	%r10,%r10,3
+	lgr	%r12,%r10
+	aghi	%r10,176
+	sgr 	%r15,%r10
+	agr	%r12,%r15
+	stg	%r11,0(%r15)
+
+	lgr	%r9,%r2			// pkfn
+
+#if OMPT_SUPPORT
+	// Save frame pointer into exit_frame
+	lg	%r8,160(%r11)
+	stg	%r11,0(%r8)
+#endif
+
+	// Prepare arguments for the pkfn function (first 5 using r2-r6 registers)
+
+	stg	%r3,160(%r12)
+	la	%r2,164(%r12)		// gtid
+	stg	%r4,168(%r12)
+	la	%r3,172(%r12)		// tid
+	lgr	%r8,%r6			// argv
+
+	// If argc > 0
+	ltgr	%r7,%r5
+	jz	1f
+
+	lg	%r4,0(%r8)		// argv[0]
+	aghi	%r7,-1
+	jz	1f
+
+	// If argc > 1
+	lg	%r5,8(%r8)		// argv[1]
+	aghi	%r7,-1
+	jz	1f
+
+	// If argc > 2
+	lg	%r6,16(%r8)		// argv[2]
+	aghi	%r7,-1
+	jz	1f
+
+	lghi	%r13,0			// Index [n]
+2:
+	lg	%r0,24(%r13,%r8)	// argv[2+n]
+	stg	%r0,160(%r13,%r15)	// parm[2+n]
+	aghi	%r13,8			// Next
+	aghi	%r7,-1
+	jnz	2b
+
+1:
+	basr	%r14,%r9		// Call pkfn
+
+	// Restore stack and return
+
+	lgr	%r15,%r11
+	lmg	%r6,%r14,48(%r15)
+	lghi	%r2,1
+	br	%r14
+.Lfunc_end0:
+	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
+	.cfi_endproc
+
+// -- End  __kmp_invoke_microtask
+
+#endif /* KMP_ARCH_S390X */
+
 #if KMP_ARCH_ARM || KMP_ARCH_MIPS
     .data
     COMMON .gomp_critical_user_, 32, 3
@@ -2266,7 +2419,8 @@ __kmp_unnamed_critical_addr:
 #endif /* KMP_ARCH_ARM */
 
 #if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||                   \
-    KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
+    KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE ||                 \
+    KMP_ARCH_S390X
 #ifndef KMP_PREFIX_UNDERSCORE
 # define KMP_PREFIX_UNDERSCORE(x) x
 #endif
@@ -2281,7 +2435,8 @@ KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
     .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
 #endif
 #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
-          KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE */
+          KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE || 
+          KMP_ARCH_S390X */
 
 #if KMP_OS_LINUX
 # if KMP_ARCH_ARM || KMP_ARCH_AARCH64
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 843d23816b03394..e17bed2a4f76614 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -2476,7 +2476,7 @@ int __kmp_get_load_balance(int max) {
 #if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC ||                            \
       ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) ||                 \
       KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||            \
-      KMP_ARCH_ARM || KMP_ARCH_VE)
+      KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X)
 
 // we really only need the case with 1 argument, because CLANG always builds
 // a struct of pointers to shared variables referenced in the outlined function
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index 650d3853e851112..27ff057c85f60f2 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -51,6 +51,8 @@ flags = " -I " + config.test_source_root + \
     " " + config.test_extra_flags
 if config.has_omit_frame_pointer_flag:
     flags += " -fno-omit-frame-pointer"
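+# s390x tests need the stack back chain (cf. the "backchain" target
+# attribute in kmp_tasking.cpp), hence -mbackchain.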
+if config.target_arch == "s390x":
+    flags += " -mbackchain"
 
 config.test_flags = " -I " + config.omp_header_directory + flags
 config.test_flags_use_compiler_omp_h = flags
diff --git a/openmp/runtime/test/ompt/callback.h b/openmp/runtime/test/ompt/callback.h
index c5266e230c26f77..efbd4c716e0ee1e 100644
--- a/openmp/runtime/test/ompt/callback.h
+++ b/openmp/runtime/test/ompt/callback.h
@@ -228,6 +228,22 @@ ompt_label_##id:
   printf("%" PRIu64 ": current_address=%p or %p\n",                            \
          ompt_get_thread_data()->value, ((char *)addr) - 8,                    \
          ((char *)addr) - 8)
+#elif KMP_ARCH_S390X
+// On s390x the NOP instruction is 2 bytes long. For non-void runtime
+// functions Clang inserts an STY instruction (another 6 bytes), but only
+// when compiling with -fno-PIC, which will be the default with Clang 8.0.
+//
+// Another possibility is:
+//
+//                brasl %r14,__kmpc_end_master@plt
+//   a7 f4 00 02  j 0f
+//   47 00 00 00  0: nop
+//   a7 f4 00 02  j addr
+//                addr:
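+//
+// Hence the three candidates printed below: addr - 2 (NOP only),
+// addr - 8 (STY + NOP), and addr - 12 (the branch sequence above).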
+#define print_possible_return_addresses(addr)                                  \
+  printf("%" PRIu64 ": current_address=%p or %p or %p\n",                      \
+         ompt_get_thread_data()->value, ((char *)addr) - 2,                    \
+         ((char *)addr) - 8, ((char *)addr) - 12)
 #else
 #error Unsupported target architecture, cannot determine address offset!
 #endif
diff --git a/openmp/runtime/tools/lib/Platform.pm b/openmp/runtime/tools/lib/Platform.pm
index d62d450e9e5dcf5..6efd932daef561b 100644
--- a/openmp/runtime/tools/lib/Platform.pm
+++ b/openmp/runtime/tools/lib/Platform.pm
@@ -65,6 +65,8 @@ sub canon_arch($) {
             $arch = "riscv64";
         } elsif ( $arch =~ m{\Aloongarch64} ) {
             $arch = "loongarch64";
+        } elsif ( $arch =~ m{\As390x} ) {
+            $arch = "s390x";
         } else {
             $arch = undef;
         }; # if
@@ -230,6 +232,8 @@ sub target_options() {
         $_host_arch = "riscv64";
     } elsif ( $hardware_platform eq "loongarch64" ) {
         $_host_arch = "loongarch64";
+    } elsif ( $hardware_platform eq "s390x" ) {
+        $_host_arch = "s390x";
     } else {
         die "Unsupported host hardware platform: \"$hardware_platform\"; stopped";
     }; # if
@@ -419,7 +423,7 @@ the script assumes host architecture is target one.
 
 Input string is an architecture name to canonize. The function recognizes many variants, for example:
 C<32e>, C<Intel64>, C<Intel(R) 64>, etc. Returned string is a canonized architecture name,
-one of: C<32>, C<32e>, C<64>, C<arm>, C<ppc64le>, C<ppc64>, C<mic>, C<mips>, C<mips64>, C<riscv64>, C<loongarch64> or C<undef> is input string is not recognized.
+one of: C<32>, C<32e>, C<64>, C<arm>, C<ppc64le>, C<ppc64>, C<mic>, C<mips>, C<mips64>, C<riscv64>, C<loongarch64>, C<s390x>, or C<undef> if the input string is not recognized.
 
 =item B<legal_arch( $arch )>
 
diff --git a/openmp/runtime/tools/lib/Uname.pm b/openmp/runtime/tools/lib/Uname.pm
index 8a976addcff03e0..9dde444d56a4ece 100644
--- a/openmp/runtime/tools/lib/Uname.pm
+++ b/openmp/runtime/tools/lib/Uname.pm
@@ -160,6 +160,8 @@ if ( 0 ) {
         $values{ hardware_platform } = "riscv64";
     } elsif ( $values{ machine } =~ m{\Aloongarch64\z} ) {
         $values{ hardware_platform } = "loongarch64";
+    } elsif ( $values{ machine } =~ m{\As390x\z} ) {
+        $values{ hardware_platform } = "s390x";
     } else {
         die "Unsupported machine (\"$values{ machine }\") returned by POSIX::uname(); stopped";
     }; # if

>From 9b3bb7a066c407f908cb1896abfa4fb4a8ea6588 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 3 Nov 2023 11:45:08 +0000
Subject: [PATCH 15/76] [AArch64] Implement reinterpret builtins for SVE vector
 tuples (#69598)

This patch adds reinterpret builtins as proposed here:
https://github.com/ARM-software/acle/pull/275.

The builtins take the form:

    sv<dst>x<N>_t svreinterpret_<dst>_<src>_x<N>(sv<src>x<N>_t op)

where
- <src> and <dst> designate the source and the destination type,
  respectively, each pair chosen from {s8, u8, s16, u16, s32, u32, s64,
  u64, bf16, f16, f32, f64}
- <N> designates the number of tuple elements: 2, 3 or 4

A short (overloaded) form is also provided, where the destination type is
explicitly designated and the source type is deduced from the parameter
type. These take the form

    sv<dst>x<N>_t svreinterpret_<dst>(sv<src>x<N>_t op)

For example:

    svuint16x2_t svreinterpret_u16_s32_x2(svint32x2_t op);
    svuint16x2_t svreinterpret_u16(svint32x2_t op);
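
A short usage sketch (the wrapper names below are made up for
illustration; the intrinsics follow the scheme above and assume an
SVE-enabled AArch64 toolchain):

    #include <arm_sve.h>

    // Explicit form: source and destination types both designated.
    svuint16x2_t cast_pair(svint32x2_t op) {
      return svreinterpret_u16_s32_x2(op);
    }

    // Short overloaded form: the source type is deduced from 'op'.
    svuint16x2_t cast_pair_short(svint32x2_t op) {
      return svreinterpret_u16(op);
    }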
---
 clang/lib/CodeGen/CGBuiltin.cpp               |    2 +-
 .../acle_sve_reinterpret-bfloat.c             |  807 +++-
 .../acle_sve_reinterpret.c                    | 4035 ++++++++++++++++-
 clang/utils/TableGen/SveEmitter.cpp           |  111 +-
 4 files changed, 4617 insertions(+), 338 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 280a2e1f1ee2c77..972aa1c708e5f65 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9994,7 +9994,7 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
                                                   const CallExpr *E) {
   llvm::Type *Ty = ConvertType(E->getType());
   if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
-      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
+      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64_x4) {
     Value *Val = EmitScalarExpr(E->getArg(0));
     return EmitSVEReinterpret(Val, Ty);
   }
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
index 8439ec175f96f2a..75d8feb8a847c3b 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
@@ -1,19 +1,44 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
 
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
+#ifdef TUPLE
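+// TYPE(base) appends the tuple suffix: with -DTUPLE=x2, TYPE(svint8)
+// expands to svint8x2_t (without TUPLE it is plain svint8_t, see #else).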
+#define TYPE_1(base,tuple) base ## tuple ## _t
+#define TYPE_0(base,tuple) TYPE_1(base,tuple)
+#define TYPE(base) TYPE_0(base,TUPLE)
+#else
+#define TYPE(base) base ## _t
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
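+// e.g. SVE_ACLE_FUNC(svreinterpret_s8, _bf16) -> svreinterpret_s8; the
+// overloaded form drops the type suffix.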
 #else
-#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#ifdef TUPLE
+#define SVE_ACLE_FUNC_1(A1,A2,T) A1##A2##_##T
+#define SVE_ACLE_FUNC_0(A1,A2,T) SVE_ACLE_FUNC_1(A1,A2,T)
+#define SVE_ACLE_FUNC(A1,A2) SVE_ACLE_FUNC_0(A1,A2,TUPLE)
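+// e.g. SVE_ACLE_FUNC(svreinterpret_s8, _bf16) -> svreinterpret_s8_bf16_x2
+// for -DTUPLE=x2.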
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
 #endif
 
 // CHECK-LABEL: @test_svreinterpret_s8_bf16(
@@ -21,13 +46,43 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s8_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_s8, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s8_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_s8, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_bf16(
@@ -35,13 +90,43 @@ svint8_t test_svreinterpret_s8_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_s16_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_s16, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s16_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s16_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_s16_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_s16, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_bf16(
@@ -49,26 +134,86 @@ svint16_t test_svreinterpret_s16_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_s32_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_s32, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_s32_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_s32, _bf16)(op);
 }
 // CHECK-LABEL: @test_svreinterpret_s64_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_s64_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_s64, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_s64_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_s64, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_bf16(
@@ -76,13 +221,43 @@ svint64_t test_svreinterpret_s64_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u8_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_u8, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u8_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_u8, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_bf16(
@@ -90,13 +265,43 @@ svuint8_t test_svreinterpret_u8_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_u16_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_u16, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u16_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_u16, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_bf16(
@@ -104,13 +309,43 @@ svuint16_t test_svreinterpret_u16_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_u32_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_u32, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u32_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_u32, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_bf16(
@@ -118,13 +353,43 @@ svuint32_t test_svreinterpret_u32_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_u64_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_u64, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_u64_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_u64, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_s8(
@@ -132,13 +397,43 @@ svuint64_t test_svreinterpret_u64_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_bf16_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_s8(svint8_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _s8, , )(op);
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_bf16_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_s16(
@@ -146,13 +441,43 @@ svbfloat16_t test_svreinterpret_bf16_s8(svint8_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_s16(svint16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _s16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_s32(
@@ -160,13 +485,43 @@ svbfloat16_t test_svreinterpret_bf16_s16(svint16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_s32(svint32_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _s32, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_s64(
@@ -174,13 +529,43 @@ svbfloat16_t test_svreinterpret_bf16_s32(svint32_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_s64(svint64_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _s64, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_u8(
@@ -188,13 +573,43 @@ svbfloat16_t test_svreinterpret_bf16_s64(svint64_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_bf16_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_u8(svuint8_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _u8, , )(op);
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_bf16_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_u16(
@@ -202,13 +617,43 @@ svbfloat16_t test_svreinterpret_bf16_u8(svuint8_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_u16(svuint16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _u16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_u32(
@@ -216,13 +661,43 @@ svbfloat16_t test_svreinterpret_bf16_u16(svuint16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_u32(svuint32_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _u32, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_u64(
@@ -230,25 +705,79 @@ svbfloat16_t test_svreinterpret_bf16_u32(svuint32_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_u64(svuint64_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _u64, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_bf16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z28test_svreinterpret_bf16_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[OP:%.*]]
 //
-svbfloat16_t test_svreinterpret_bf16_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z28test_svreinterpret_bf16_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[OP:%.*]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_f16(
@@ -256,13 +785,43 @@ svbfloat16_t test_svreinterpret_bf16_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_f16(svfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _f16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_f32(
@@ -270,13 +829,43 @@ svbfloat16_t test_svreinterpret_bf16_f16(svfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_f32(svfloat32_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _f32, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_bf16_f64(
@@ -284,13 +873,43 @@ svbfloat16_t test_svreinterpret_bf16_f32(svfloat32_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_bf16_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 16 x bfloat>
+// TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_bf16_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 24 x bfloat>
+// TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_bf16_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x bfloat>
+// TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_bf16_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 8 x bfloat>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
-svbfloat16_t test_svreinterpret_bf16_f64(svfloat64_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_bf16, _f64, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 16 x bfloat>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 24 x bfloat>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x bfloat> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_bf16_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x bfloat>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) {
+  return SVE_ACLE_FUNC(svreinterpret_bf16, _f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_bf16(
@@ -298,13 +917,43 @@ svbfloat16_t test_svreinterpret_bf16_f64(svfloat64_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_f32_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_f32, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f32_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_f32, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_bf16(
@@ -312,13 +961,43 @@ svfloat32_t test_svreinterpret_f32_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_f16_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_f16, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f16_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_f16, _bf16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_bf16(
@@ -326,11 +1005,41 @@ svfloat16_t test_svreinterpret_f16_bf16(svbfloat16_t op) {
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_bf16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_bf16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_bf16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z27test_svreinterpret_f64_bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x bfloat> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_bf16(svbfloat16_t op) {
-  return SVE_ACLE_FUNC(svreinterpret_f64, _bf16, , )(op);
+// CPP-TUPLE2-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x bfloat> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x bfloat> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z27test_svreinterpret_f64_bf1614svbfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) {
+  return SVE_ACLE_FUNC(svreinterpret_f64, _bf16)(op);
 }
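
[Editor's note, not part of the patch: every reinterpret above lowers to a plain bitcast, so for a tuple factor N the lane count scales by N while the total width per vscale chunk stays 128*N bits. A minimal standalone C sketch checking the widths used in the CHECK lines, e.g. svuint32xN_t <-> svbfloat16xN_t:]

/* Sanity sketch of the vector widths in the checks above: a 128-bit SVE
 * chunk holds 4 x i32 or 8 x bfloat, so an N-tuple bitcast maps
 * <vscale x 4N x i32> to <vscale x 8N x bfloat> at equal bit width. */
#include <assert.h>
#include <stdio.h>

int main(void) {
  for (int n = 1; n <= 4; ++n) {
    int u32_lanes  = 4 * n;  /* <vscale x 4N x i32>    */
    int bf16_lanes = 8 * n;  /* <vscale x 8N x bfloat> */
    assert(u32_lanes * 32 == bf16_lanes * 16); /* both 128*N bits */
    printf("x%d: %d lanes i32 <-> %d lanes bf16, %d bits per chunk\n",
           n, u32_lanes, bf16_lanes, 128 * n);
  }
  return 0;
}
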
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
index c4fe461c709b5f2..24167a8f5ce108c 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
@@ -1,31 +1,80 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
+// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
+// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x2 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sve.h>
 
+#ifdef TUPLE
+#define TYPE_1(base,tuple) base ## tuple ## _t
+#define TYPE_0(base,tuple) TYPE_1(base,tuple)
+#define TYPE(base) TYPE_0(base,TUPLE)
+#else
+#define TYPE(base) base ## _t
+#endif
+
 #ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#ifdef TUPLE
+#define SVE_ACLE_FUNC_1(A1,A2,T) A1##A2##_##T
+#define SVE_ACLE_FUNC_0(A1,A2,T) SVE_ACLE_FUNC_1(A1,A2,T)
+#define SVE_ACLE_FUNC(A1,A2) SVE_ACLE_FUNC_0(A1,A2,TUPLE)
 #else
-#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
 #endif
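
[Editor's note, not part of the patch: the TYPE_1/TYPE_0 and SVE_ACLE_FUNC_1/SVE_ACLE_FUNC_0 pairs added above are the standard deferred-expansion idiom: the extra indirection forces TUPLE to expand to its value (e.g. x2) before the ## paste, whereas a single-level macro would paste the literal token "TUPLE". A standalone C sketch mirroring the patch's macros; the STR helpers are illustrative additions used only to print the expansions:]

#include <stdio.h>

#define TUPLE x2  /* normally supplied via -DTUPLE=x2 in the RUN lines */

/* Copied from the patch: */
#define TYPE_1(base,tuple) base ## tuple ## _t
#define TYPE_0(base,tuple) TYPE_1(base,tuple)
#define TYPE(base) TYPE_0(base,TUPLE)
#define SVE_ACLE_FUNC_1(A1,A2,T) A1##A2##_##T
#define SVE_ACLE_FUNC_0(A1,A2,T) SVE_ACLE_FUNC_1(A1,A2,T)
#define SVE_ACLE_FUNC(A1,A2) SVE_ACLE_FUNC_0(A1,A2,TUPLE)

/* Illustrative helpers (not in the patch) to stringify the results: */
#define STR_(x) #x
#define STR(x) STR_(x)

int main(void) {
  puts(STR(TYPE(svint8)));                        /* svint8x2_t */
  puts(STR(SVE_ACLE_FUNC(svreinterpret_s8,_s16))); /* svreinterpret_s8_s16_x2 */
  return 0;
}
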
 
 // CHECK-LABEL: @test_svreinterpret_s8_s8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z24test_svreinterpret_s8_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[OP:%.*]]
 //
-svint8_t test_svreinterpret_s8_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z24test_svreinterpret_s8_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z24test_svreinterpret_s8_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z24test_svreinterpret_s8_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[OP:%.*]]
+//
+TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s8_s16(
@@ -33,14 +82,44 @@ svint8_t test_svreinterpret_s8_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s8_s32(
@@ -48,14 +127,44 @@ svint8_t test_svreinterpret_s8_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s8_s64(
@@ -63,27 +172,81 @@ svint8_t test_svreinterpret_s8_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s8_u8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z24test_svreinterpret_s8_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[OP:%.*]]
 //
-svint8_t test_svreinterpret_s8_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z24test_svreinterpret_s8_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z24test_svreinterpret_s8_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z24test_svreinterpret_s8_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[OP:%.*]]
+//
+TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s8_u16(
@@ -91,44 +254,136 @@ svint8_t test_svreinterpret_s8_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_u16)(op);
 }
 
+//
 // CHECK-LABEL: @test_svreinterpret_s8_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_u32)(op);
 }
 
+//
 // CHECK-LABEL: @test_svreinterpret_s8_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s8_f16(
@@ -136,14 +391,44 @@ svint8_t test_svreinterpret_s8_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s8_f32(
@@ -151,14 +436,44 @@ svint8_t test_svreinterpret_s8_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s8_f64(
@@ -166,14 +481,44 @@ svint8_t test_svreinterpret_s8_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s8_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s8_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s8_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s8_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svint8_t test_svreinterpret_s8_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s8_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s8_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s8_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s8,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s8,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_s8(
@@ -181,27 +526,81 @@ svint8_t test_svreinterpret_s8_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s16_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s16_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s16_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s16_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[OP:%.*]]
 //
-svint16_t test_svreinterpret_s16_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[OP:%.*]]
+//
+TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_s32(
@@ -209,14 +608,44 @@ svint16_t test_svreinterpret_s16_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_s64(
@@ -224,14 +653,44 @@ svint16_t test_svreinterpret_s16_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_u8(
@@ -239,27 +698,81 @@ svint16_t test_svreinterpret_s16_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s16_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s16_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s16_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s16_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[OP:%.*]]
 //
-svint16_t test_svreinterpret_s16_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[OP:%.*]]
+//
+TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_u32(
@@ -267,14 +780,44 @@ svint16_t test_svreinterpret_s16_u16(svuint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_u64(
@@ -282,14 +825,44 @@ svint16_t test_svreinterpret_s16_u32(svuint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_f16(
@@ -297,14 +870,44 @@ svint16_t test_svreinterpret_s16_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_f32(
@@ -312,14 +915,44 @@ svint16_t test_svreinterpret_s16_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s16_f64(
@@ -327,14 +960,44 @@ svint16_t test_svreinterpret_s16_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s16_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s16_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s16_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s16_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svint16_t test_svreinterpret_s16_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s16_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s16_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s16_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s16,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s16,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_s8(
@@ -342,14 +1005,44 @@ svint16_t test_svreinterpret_s16_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s32_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s32_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s32_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s32_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_s16(
@@ -357,28 +1050,81 @@ svint32_t test_svreinterpret_s32_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_s16)(op);
 }
 
-//
 // CHECK-LABEL: @test_svreinterpret_s32_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[OP:%.*]]
 //
-svint32_t test_svreinterpret_s32_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[OP:%.*]]
+//
+TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_s64(
@@ -386,14 +1132,44 @@ svint32_t test_svreinterpret_s32_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_u8(
@@ -401,14 +1177,44 @@ svint32_t test_svreinterpret_s32_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s32_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s32_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s32_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s32_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_u16(
@@ -416,27 +1222,81 @@ svint32_t test_svreinterpret_s32_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[OP:%.*]]
 //
-svint32_t test_svreinterpret_s32_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[OP:%.*]]
+//
+TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_u64(
@@ -444,14 +1304,44 @@ svint32_t test_svreinterpret_s32_u32(svuint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_f16(
@@ -459,14 +1349,44 @@ svint32_t test_svreinterpret_s32_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s32_f32(
@@ -474,29 +1394,90 @@ svint32_t test_svreinterpret_s32_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_f32)(op);
 }
 
+//
 // CHECK-LABEL: @test_svreinterpret_s32_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s32_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s32_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s32_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s32_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svint32_t test_svreinterpret_s32_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s32_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s32_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s32_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s32,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s32,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_s8(
@@ -504,14 +1485,44 @@ svint32_t test_svreinterpret_s32_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s64_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s64_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s64_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s64_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_s16(
@@ -519,14 +1530,44 @@ svint64_t test_svreinterpret_s64_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_s32(
@@ -534,27 +1575,81 @@ svint64_t test_svreinterpret_s64_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[OP:%.*]]
 //
-svint64_t test_svreinterpret_s64_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[OP:%.*]]
+//
+TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_u8(
@@ -562,14 +1657,44 @@ svint64_t test_svreinterpret_s64_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_s64_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_s64_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_s64_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_s64_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_u16(
@@ -577,14 +1702,44 @@ svint64_t test_svreinterpret_s64_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_u32(
@@ -592,27 +1747,81 @@ svint64_t test_svreinterpret_s64_u16(svuint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[OP:%.*]]
 //
-svint64_t test_svreinterpret_s64_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[OP:%.*]]
+//
+TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_f16(
@@ -620,14 +1829,44 @@ svint64_t test_svreinterpret_s64_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_f32(
@@ -635,14 +1874,44 @@ svint64_t test_svreinterpret_s64_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_s64_f64(
@@ -650,27 +1919,81 @@ svint64_t test_svreinterpret_s64_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_s64_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_s64_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_s64_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_s64_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svint64_t test_svreinterpret_s64_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_s64_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_s64_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_s64_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_s64,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_s64,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_s8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z24test_svreinterpret_u8_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[OP:%.*]]
 //
-svuint8_t test_svreinterpret_u8_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z24test_svreinterpret_u8_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z24test_svreinterpret_u8_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z24test_svreinterpret_u8_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[OP:%.*]]
+//
+TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_s16(
@@ -678,14 +2001,44 @@ svuint8_t test_svreinterpret_u8_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_s32(
@@ -693,14 +2046,44 @@ svuint8_t test_svreinterpret_u8_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_s64(
@@ -708,27 +2091,81 @@ svuint8_t test_svreinterpret_u8_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_u8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z24test_svreinterpret_u8_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[OP:%.*]]
 //
-svuint8_t test_svreinterpret_u8_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z24test_svreinterpret_u8_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z24test_svreinterpret_u8_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z24test_svreinterpret_u8_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[OP:%.*]]
+//
+TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_u16(
@@ -736,14 +2173,44 @@ svuint8_t test_svreinterpret_u8_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_u32(
@@ -751,14 +2218,44 @@ svuint8_t test_svreinterpret_u8_u16(svuint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_u64(
@@ -766,14 +2263,44 @@ svuint8_t test_svreinterpret_u8_u32(svuint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_f16(
@@ -781,14 +2308,44 @@ svuint8_t test_svreinterpret_u8_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_f32(
@@ -796,14 +2353,44 @@ svuint8_t test_svreinterpret_u8_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u8_f64(
@@ -811,14 +2398,44 @@ svuint8_t test_svreinterpret_u8_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 16 x i8>
 // CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u8_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 32 x i8>
+// TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u8_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 48 x i8>
+// TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u8_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 64 x i8>
+// TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u8_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 16 x i8>
 // CPP-CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
 //
-svuint8_t test_svreinterpret_u8_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u8_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 32 x i8>
+// CPP-TUPLE2-NEXT:    ret <vscale x 32 x i8> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u8_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 48 x i8>
+// CPP-TUPLE3-NEXT:    ret <vscale x 48 x i8> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u8_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 64 x i8>
+// CPP-TUPLE4-NEXT:    ret <vscale x 64 x i8> [[TMP0]]
+//
+TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u8,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u8,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_s8(
@@ -826,27 +2443,81 @@ svuint8_t test_svreinterpret_u8_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u16_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u16_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u16_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u16_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[OP:%.*]]
 //
-svuint16_t test_svreinterpret_u16_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[OP:%.*]]
+//
+TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_s32(
@@ -854,14 +2525,44 @@ svuint16_t test_svreinterpret_u16_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_s64(
@@ -869,14 +2570,44 @@ svuint16_t test_svreinterpret_u16_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_u8(
@@ -884,27 +2615,81 @@ svuint16_t test_svreinterpret_u16_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u16_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u16_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u16_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u16_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[OP:%.*]]
 //
-svuint16_t test_svreinterpret_u16_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[OP:%.*]]
+//
+TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_u32(
@@ -912,14 +2697,44 @@ svuint16_t test_svreinterpret_u16_u16(svuint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_u64(
@@ -927,14 +2742,44 @@ svuint16_t test_svreinterpret_u16_u32(svuint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_f16(
@@ -942,14 +2787,44 @@ svuint16_t test_svreinterpret_u16_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_f32(
@@ -957,14 +2832,44 @@ svuint16_t test_svreinterpret_u16_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u16_f64(
@@ -972,14 +2877,44 @@ svuint16_t test_svreinterpret_u16_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 8 x i16>
 // CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u16_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 16 x i16>
+// TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u16_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 24 x i16>
+// TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u16_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x i16>
+// TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u16_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP0]]
 //
-svuint16_t test_svreinterpret_u16_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u16_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 16 x i16>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x i16> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u16_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 24 x i16>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x i16> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u16_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x i16>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x i16> [[TMP0]]
+//
+TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u16,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u16,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_s8(
@@ -987,14 +2922,44 @@ svuint16_t test_svreinterpret_u16_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u32_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u32_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u32_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u32_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_s16(
@@ -1002,27 +2967,81 @@ svuint32_t test_svreinterpret_u32_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_s32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[OP:%.*]]
 //
-svuint32_t test_svreinterpret_u32_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[OP:%.*]]
+//
+TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_s64(
@@ -1030,14 +3049,44 @@ svuint32_t test_svreinterpret_u32_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_u8(
@@ -1045,14 +3094,44 @@ svuint32_t test_svreinterpret_u32_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u32_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u32_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u32_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u32_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_u16(
@@ -1060,27 +3139,81 @@ svuint32_t test_svreinterpret_u32_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[OP:%.*]]
 //
-svuint32_t test_svreinterpret_u32_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[OP:%.*]]
+//
+TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_u64(
@@ -1088,14 +3221,44 @@ svuint32_t test_svreinterpret_u32_u32(svuint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_f16(
@@ -1103,14 +3266,44 @@ svuint32_t test_svreinterpret_u32_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_f32(
@@ -1118,14 +3311,44 @@ svuint32_t test_svreinterpret_u32_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u32_f64(
@@ -1133,14 +3356,44 @@ svuint32_t test_svreinterpret_u32_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 4 x i32>
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u32_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 8 x i32>
+// TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u32_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 12 x i32>
+// TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u32_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x i32>
+// TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u32_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
-svuint32_t test_svreinterpret_u32_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u32_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 8 x i32>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x i32> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u32_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 12 x i32>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x i32> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u32_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x i32>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x i32> [[TMP0]]
+//
+TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u32,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u32,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_s8(
@@ -1148,14 +3401,44 @@ svuint32_t test_svreinterpret_u32_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u64_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u64_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u64_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u64_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_s16(
@@ -1163,14 +3446,44 @@ svuint64_t test_svreinterpret_u64_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_s32(
@@ -1178,27 +3491,81 @@ svuint64_t test_svreinterpret_u64_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_s64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[OP:%.*]]
 //
-svuint64_t test_svreinterpret_u64_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[OP:%.*]]
+//
+TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_u8(
@@ -1206,14 +3573,44 @@ svuint64_t test_svreinterpret_u64_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_u64_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_u64_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_u64_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_u64_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_u16(
@@ -1221,14 +3618,44 @@ svuint64_t test_svreinterpret_u64_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_u32(
@@ -1236,27 +3663,81 @@ svuint64_t test_svreinterpret_u64_u16(svuint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[OP:%.*]]
 //
-svuint64_t test_svreinterpret_u64_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[OP:%.*]]
+//
+TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_f16(
@@ -1264,14 +3745,44 @@ svuint64_t test_svreinterpret_u64_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_f32(
@@ -1279,14 +3790,44 @@ svuint64_t test_svreinterpret_u64_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_u64_f64(
@@ -1294,14 +3835,44 @@ svuint64_t test_svreinterpret_u64_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_u64_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 4 x i64>
+// TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_u64_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 6 x i64>
+// TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_u64_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 8 x i64>
+// TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_u64_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
-svuint64_t test_svreinterpret_u64_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_u64_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 4 x i64>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x i64> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_u64_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 6 x i64>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x i64> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_u64_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 8 x i64>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x i64> [[TMP0]]
+//
+TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_u64,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_u64,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_s8(
@@ -1309,14 +3880,44 @@ svuint64_t test_svreinterpret_u64_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_f16_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_f16_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_f16_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_f16_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_s16(
@@ -1324,14 +3925,44 @@ svfloat16_t test_svreinterpret_f16_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_s32(
@@ -1339,14 +3970,44 @@ svfloat16_t test_svreinterpret_f16_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_s64(
@@ -1354,14 +4015,44 @@ svfloat16_t test_svreinterpret_f16_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_u8(
@@ -1369,14 +4060,44 @@ svfloat16_t test_svreinterpret_f16_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_f16_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_f16_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_f16_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_f16_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_u16(
@@ -1384,14 +4105,44 @@ svfloat16_t test_svreinterpret_f16_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_u32(
@@ -1399,14 +4150,44 @@ svfloat16_t test_svreinterpret_f16_u16(svuint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_u64(
@@ -1414,27 +4195,81 @@ svfloat16_t test_svreinterpret_f16_u32(svuint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_f16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 8 x half> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[OP:%.*]]
 //
-svfloat16_t test_svreinterpret_f16_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[OP:%.*]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_f32(
@@ -1442,14 +4277,44 @@ svfloat16_t test_svreinterpret_f16_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f16_f64(
@@ -1457,14 +4322,44 @@ svfloat16_t test_svreinterpret_f16_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 8 x half>
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f16_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 16 x half>
+// TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f16_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 24 x half>
+// TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f16_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x half>
+// TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f16_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 8 x half>
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svreinterpret_f16_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f16_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 16 x half>
+// CPP-TUPLE2-NEXT:    ret <vscale x 16 x half> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f16_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 24 x half>
+// CPP-TUPLE3-NEXT:    ret <vscale x 24 x half> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f16_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x half>
+// CPP-TUPLE4-NEXT:    ret <vscale x 32 x half> [[TMP0]]
+//
+TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f16,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f16,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_s8(
@@ -1472,14 +4367,44 @@ svfloat16_t test_svreinterpret_f16_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_f32_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_f32_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_f32_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_f32_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_s16(
@@ -1487,14 +4412,44 @@ svfloat32_t test_svreinterpret_f32_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_s32(
@@ -1502,14 +4457,44 @@ svfloat32_t test_svreinterpret_f32_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_s64(
@@ -1517,14 +4502,44 @@ svfloat32_t test_svreinterpret_f32_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_u8(
@@ -1532,14 +4547,44 @@ svfloat32_t test_svreinterpret_f32_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_f32_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_f32_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_f32_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_f32_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_u16(
@@ -1547,14 +4592,44 @@ svfloat32_t test_svreinterpret_f32_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_u32(
@@ -1562,14 +4637,44 @@ svfloat32_t test_svreinterpret_f32_u16(svuint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_u64(
@@ -1577,14 +4682,44 @@ svfloat32_t test_svreinterpret_f32_u32(svuint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_f16(
@@ -1592,27 +4727,81 @@ svfloat32_t test_svreinterpret_f32_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 4 x float> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[OP:%.*]]
 //
-svfloat32_t test_svreinterpret_f32_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[OP:%.*]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f32_f64(
@@ -1620,14 +4809,44 @@ svfloat32_t test_svreinterpret_f32_f32(svfloat32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 4 x float>
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f32_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 8 x float>
+// TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f32_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 12 x float>
+// TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f32_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x float>
+// TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f32_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x double> [[OP:%.*]] to <vscale x 4 x float>
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svreinterpret_f32_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f32_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x double> [[OP:%.*]] to <vscale x 8 x float>
+// CPP-TUPLE2-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f32_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x double> [[OP:%.*]] to <vscale x 12 x float>
+// CPP-TUPLE3-NEXT:    ret <vscale x 12 x float> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f32_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x float>
+// CPP-TUPLE4-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f32,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f32,_f64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_s8(
@@ -1635,14 +4854,44 @@ svfloat32_t test_svreinterpret_f32_f64(svfloat64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_s8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_s8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_s8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_f64_s8u10__SVInt8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_s8(svint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_f64_s810svint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_f64_s810svint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_f64_s810svint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_s8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_s8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_s16(
@@ -1650,14 +4899,44 @@ svfloat64_t test_svreinterpret_f64_s8(svint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_s16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_s16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_s16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_s16u11__SVInt16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_s16(svint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_s1611svint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_s1611svint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_s1611svint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_s16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_s16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_s32(
@@ -1665,14 +4944,44 @@ svfloat64_t test_svreinterpret_f64_s16(svint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_s32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_s32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_s32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_s32u11__SVInt32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_s32(svint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_s3211svint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_s3211svint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_s3211svint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_s32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_s32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_s64(
@@ -1680,14 +4989,44 @@ svfloat64_t test_svreinterpret_f64_s32(svint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_s64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_s64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_s64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_s64u11__SVInt64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_s64(svint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_s6411svint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_s6411svint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_s6411svint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_s64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_s64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_u8(
@@ -1695,14 +5034,44 @@ svfloat64_t test_svreinterpret_f64_s64(svint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_u8(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_u8(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_u8(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z25test_svreinterpret_f64_u8u11__SVUint8_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_u8(svuint8_t op)
+// CPP-TUPLE2-LABEL: @_Z25test_svreinterpret_f64_u811svuint8x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i8> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z25test_svreinterpret_f64_u811svuint8x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 48 x i8> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z25test_svreinterpret_f64_u811svuint8x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_u8,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_u8)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_u16(
@@ -1710,14 +5079,44 @@ svfloat64_t test_svreinterpret_f64_u8(svuint8_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_u16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_u16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_u16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_u16u12__SVUint16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i16> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_u16(svuint16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_u1612svuint16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i16> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_u1612svuint16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x i16> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_u1612svuint16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_u16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_u16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_u32(
@@ -1725,14 +5124,44 @@ svfloat64_t test_svreinterpret_f64_u16(svuint16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_u32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_u32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_u32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_u32u12__SVUint32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_u32(svuint32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_u3212svuint32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i32> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_u3212svuint32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x i32> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_u3212svuint32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_u32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_u32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_u64(
@@ -1740,14 +5169,44 @@ svfloat64_t test_svreinterpret_f64_u32(svuint32_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_u64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_u64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_u64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_u64u12__SVUint64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 2 x i64> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_u64(svuint64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_u6412svuint64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i64> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_u6412svuint64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 6 x i64> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_u6412svuint64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_u64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_u64)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_f16(
@@ -1755,14 +5214,44 @@ svfloat64_t test_svreinterpret_f64_u64(svuint64_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_f16(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_f16(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_f16(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_f16u13__SVFloat16_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x half> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_f16(svfloat16_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_f1613svfloat16x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x half> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_f1613svfloat16x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 24 x half> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_f1613svfloat16x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_f16,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_f16)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_f32(
@@ -1770,25 +5259,79 @@ svfloat64_t test_svreinterpret_f64_f16(svfloat16_t op)
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 2 x double>
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_f32(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 4 x double>
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_f32(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 6 x double>
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_f32(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x double>
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_f32u13__SVFloat32_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x float> [[OP:%.*]] to <vscale x 2 x double>
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svreinterpret_f64_f32(svfloat32_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_f3213svfloat32x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 8 x float> [[OP:%.*]] to <vscale x 4 x double>
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[TMP0]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_f3213svfloat32x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 12 x float> [[OP:%.*]] to <vscale x 6 x double>
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[TMP0]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_f3213svfloat32x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x double>
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[TMP0]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_f32,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_f32)(op);
 }
 
 // CHECK-LABEL: @test_svreinterpret_f64_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret <vscale x 2 x double> [[OP:%.*]]
 //
+// TUPLE2-LABEL: @test_svreinterpret_f64_f64(
+// TUPLE2-NEXT:  entry:
+// TUPLE2-NEXT:    ret <vscale x 4 x double> [[OP:%.*]]
+//
+// TUPLE3-LABEL: @test_svreinterpret_f64_f64(
+// TUPLE3-NEXT:  entry:
+// TUPLE3-NEXT:    ret <vscale x 6 x double> [[OP:%.*]]
+//
+// TUPLE4-LABEL: @test_svreinterpret_f64_f64(
+// TUPLE4-NEXT:  entry:
+// TUPLE4-NEXT:    ret <vscale x 8 x double> [[OP:%.*]]
+//
 // CPP-CHECK-LABEL: @_Z26test_svreinterpret_f64_f64u13__SVFloat64_t(
 // CPP-CHECK-NEXT:  entry:
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[OP:%.*]]
 //
-svfloat64_t test_svreinterpret_f64_f64(svfloat64_t op)
+// CPP-TUPLE2-LABEL: @_Z26test_svreinterpret_f64_f6413svfloat64x2_t(
+// CPP-TUPLE2-NEXT:  entry:
+// CPP-TUPLE2-NEXT:    ret <vscale x 4 x double> [[OP:%.*]]
+//
+// CPP-TUPLE3-LABEL: @_Z26test_svreinterpret_f64_f6413svfloat64x3_t(
+// CPP-TUPLE3-NEXT:  entry:
+// CPP-TUPLE3-NEXT:    ret <vscale x 6 x double> [[OP:%.*]]
+//
+// CPP-TUPLE4-LABEL: @_Z26test_svreinterpret_f64_f6413svfloat64x4_t(
+// CPP-TUPLE4-NEXT:  entry:
+// CPP-TUPLE4-NEXT:    ret <vscale x 8 x double> [[OP:%.*]]
+//
+TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op)
 {
-  return SVE_ACLE_FUNC(svreinterpret_f64,_f64,,)(op);
+  return SVE_ACLE_FUNC(svreinterpret_f64,_f64)(op);
 }
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 368908e79bf1963..6c82368c3c7229c 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -23,16 +23,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/TableGen/Record.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/TableGen/Error.h"
-#include <string>
-#include <sstream>
-#include <set>
+#include "llvm/TableGen/Record.h"
+#include <array>
 #include <cctype>
+#include <set>
+#include <sstream>
+#include <string>
 #include <tuple>
 
 using namespace llvm;
@@ -64,26 +65,29 @@ class ImmCheck {
 };
 
 class SVEType {
-  TypeSpec TS;
   bool Float, Signed, Immediate, Void, Constant, Pointer, BFloat;
   bool DefaultType, IsScalable, Predicate, PredicatePattern, PrefetchOp,
       Svcount;
   unsigned Bitwidth, ElementBitwidth, NumVectors;
 
 public:
-  SVEType() : SVEType(TypeSpec(), 'v') {}
+  SVEType() : SVEType("", 'v') {}
 
-  SVEType(TypeSpec TS, char CharMod, unsigned NumVectors = 1)
-      : TS(TS), Float(false), Signed(true), Immediate(false), Void(false),
+  SVEType(StringRef TS, char CharMod, unsigned NumVectors = 1)
+      : Float(false), Signed(true), Immediate(false), Void(false),
         Constant(false), Pointer(false), BFloat(false), DefaultType(false),
         IsScalable(true), Predicate(false), PredicatePattern(false),
         PrefetchOp(false), Svcount(false), Bitwidth(128), ElementBitwidth(~0U),
         NumVectors(NumVectors) {
     if (!TS.empty())
-      applyTypespec();
+      applyTypespec(TS);
     applyModifier(CharMod);
   }
 
+  SVEType(const SVEType &Base, unsigned NumV) : SVEType(Base) {
+    NumVectors = NumV;
+  }
+
   bool isPointer() const { return Pointer; }
   bool isVoidPointer() const { return Pointer && Void; }
   bool isSigned() const { return Signed; }
@@ -129,13 +133,12 @@ class SVEType {
 
 private:
   /// Creates the type based on the typespec string in TS.
-  void applyTypespec();
+  void applyTypespec(StringRef TS);
 
   /// Applies a prototype modifier to the type.
   void applyModifier(char Mod);
 };
 
-
 class SVEEmitter;
 
 /// The main grunt class. This represents an instantiation of an intrinsic with
@@ -263,17 +266,11 @@ class SVEEmitter {
   // which is inconvenient to specify in the arm_sve.td file or
   // generate in CGBuiltin.cpp.
   struct ReinterpretTypeInfo {
+    SVEType BaseType;
     const char *Suffix;
-    const char *Type;
-    const char *BuiltinType;
   };
-  SmallVector<ReinterpretTypeInfo, 12> Reinterprets = {
-      {"s8", "svint8_t", "q16Sc"},   {"s16", "svint16_t", "q8Ss"},
-      {"s32", "svint32_t", "q4Si"},  {"s64", "svint64_t", "q2SWi"},
-      {"u8", "svuint8_t", "q16Uc"},  {"u16", "svuint16_t", "q8Us"},
-      {"u32", "svuint32_t", "q4Ui"}, {"u64", "svuint64_t", "q2UWi"},
-      {"f16", "svfloat16_t", "q8h"}, {"bf16", "svbfloat16_t", "q8y"},
-      {"f32", "svfloat32_t", "q4f"}, {"f64", "svfloat64_t", "q2d"}};
+
+  static const std::array<ReinterpretTypeInfo, 12> Reinterprets;
 
   RecordKeeper &Records;
   llvm::StringMap<uint64_t> EltTypes;
@@ -383,6 +380,20 @@ class SVEEmitter {
                        SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out);
 };
 
+const std::array<SVEEmitter::ReinterpretTypeInfo, 12> SVEEmitter::Reinterprets =
+    {{{SVEType("c", 'd'), "s8"},
+      {SVEType("Uc", 'd'), "u8"},
+      {SVEType("s", 'd'), "s16"},
+      {SVEType("Us", 'd'), "u16"},
+      {SVEType("i", 'd'), "s32"},
+      {SVEType("Ui", 'd'), "u32"},
+      {SVEType("l", 'd'), "s64"},
+      {SVEType("Ul", 'd'), "u64"},
+      {SVEType("h", 'd'), "f16"},
+      {SVEType("b", 'd'), "bf16"},
+      {SVEType("f", 'd'), "f32"},
+      {SVEType("d", 'd'), "f64"}}};
+
 } // end anonymous namespace
 
 
@@ -497,7 +508,8 @@ std::string SVEType::str() const {
 
   return S;
 }
-void SVEType::applyTypespec() {
+
+void SVEType::applyTypespec(StringRef TS) {
   for (char I : TS) {
     switch (I) {
     case 'Q':
@@ -1315,21 +1327,28 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
         "__nodebug__, __overloadable__))\n\n";
 
   // Add reinterpret functions.
-  for (auto ShortForm : { false, true } )
-    for (const ReinterpretTypeInfo &From : Reinterprets)
+  for (auto [N, Suffix] :
+       std::initializer_list<std::pair<unsigned, const char *>>{
+           {1, ""}, {2, "_x2"}, {3, "_x3"}, {4, "_x4"}}) {
+    for (auto ShortForm : {false, true})
       for (const ReinterpretTypeInfo &To : Reinterprets) {
-        if (ShortForm) {
-          OS << "__aio __attribute__((target(\"sve\"))) " << From.Type
-             << " svreinterpret_" << From.Suffix;
-          OS << "(" << To.Type << " op) __arm_streaming_compatible {\n";
-          OS << "  return __builtin_sve_reinterpret_" << From.Suffix << "_"
-             << To.Suffix << "(op);\n";
-          OS << "}\n\n";
-        } else
-          OS << "#define svreinterpret_" << From.Suffix << "_" << To.Suffix
-             << "(...) __builtin_sve_reinterpret_" << From.Suffix << "_"
-             << To.Suffix << "(__VA_ARGS__)\n";
+        SVEType ToV(To.BaseType, N);
+        for (const ReinterpretTypeInfo &From : Reinterprets) {
+          SVEType FromV(From.BaseType, N);
+          if (ShortForm) {
+            OS << "__aio __attribute__((target(\"sve\"))) " << ToV.str()
+               << " svreinterpret_" << To.Suffix;
+            OS << "(" << FromV.str() << " op) __arm_streaming_compatible {\n";
+            OS << "  return __builtin_sve_reinterpret_" << To.Suffix << "_"
+               << From.Suffix << Suffix << "(op);\n";
+            OS << "}\n\n";
+          } else
+            OS << "#define svreinterpret_" << To.Suffix << "_" << From.Suffix
+               << Suffix << "(...) __builtin_sve_reinterpret_" << To.Suffix
+               << "_" << From.Suffix << Suffix << "(__VA_ARGS__)\n";
+        }
       }
+  }
 
   SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
   std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
@@ -1394,12 +1413,20 @@ void SVEEmitter::createBuiltins(raw_ostream &OS) {
          << "\")\n";
   }
 
-  // Add reinterpret builtins
-  for (const ReinterpretTypeInfo &From : Reinterprets)
-    for (const ReinterpretTypeInfo &To : Reinterprets)
-      OS << "TARGET_BUILTIN(__builtin_sve_reinterpret_" << From.Suffix << "_"
-         << To.Suffix << +", \"" << From.BuiltinType << To.BuiltinType
-         << "\", \"n\", \"sve\")\n";
+  // Add reinterpret functions.
+  for (auto [N, Suffix] :
+       std::initializer_list<std::pair<unsigned, const char *>>{
+           {1, ""}, {2, "_x2"}, {3, "_x3"}, {4, "_x4"}}) {
+    for (const ReinterpretTypeInfo &To : Reinterprets) {
+      SVEType ToV(To.BaseType, N);
+      for (const ReinterpretTypeInfo &From : Reinterprets) {
+        SVEType FromV(From.BaseType, N);
+        OS << "TARGET_BUILTIN(__builtin_sve_reinterpret_" << To.Suffix << "_"
+           << From.Suffix << Suffix << +", \"" << ToV.builtin_str()
+           << FromV.builtin_str() << "\", \"n\", \"sve\")\n";
+      }
+    }
+  }
 
   OS << "#endif\n\n";
 }

>From 97a238e863fdf1f1a54e4bb609e130a6456a4f61 Mon Sep 17 00:00:00 2001
From: Christian Ulmann <christianulmann at gmail.com>
Date: Fri, 3 Nov 2023 13:02:35 +0100
Subject: [PATCH 16/76] [MLIR][LLVM] Remove typed pointer conversion utils
 (#71169)

This commit removes the no-longer-required typed pointer helpers from the
LLVM dialect conversion utils. Typed pointers have been deprecated for a
while now, and they are planned to be removed from the LLVM dialect soon.

Related PSA:
https://discourse.llvm.org/t/psa-removal-of-typed-pointers-from-the-llvm-dialect/74502
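
For reference, the mechanical migration applied at each call site looks
roughly like this (a minimal sketch, assuming `rewriter`, `elementType`,
and `addrSpace` stand in for whatever is in scope at the call site):

  #include "mlir/Dialect/LLVMIR/LLVMTypes.h"

  // Before (removed helper; the element type only mattered for the
  // deprecated typed-pointer mode):
  //   LLVM::LLVMPointerType ptrTy =
  //       getTypeConverter()->getPointerType(elementType, addrSpace);

  // After: build the (always opaque) pointer type directly from the
  // MLIRContext and the address space; no element type is needed.
  LLVM::LLVMPointerType ptrTy =
      LLVM::LLVMPointerType::get(rewriter.getContext(), addrSpace);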
---
 .../Conversion/LLVMCommon/LoweringOptions.h   |  1 -
 .../Conversion/LLVMCommon/TypeConverter.h     | 11 ----
 .../mlir/Dialect/LLVMIR/FunctionCallUtils.h   | 24 +++----
 mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp |  2 +-
 .../Conversion/GPUCommon/GPUOpsLowering.cpp   | 45 ++++++-------
 .../Conversion/LLVMCommon/MemRefBuilder.cpp   | 25 ++++----
 mlir/lib/Conversion/LLVMCommon/Pattern.cpp    | 17 ++---
 .../Conversion/LLVMCommon/PrintCallHelper.cpp | 14 ++--
 .../Conversion/LLVMCommon/TypeConverter.cpp   | 28 +++-----
 .../MemRefToLLVM/AllocLikeConversion.cpp      |  4 +-
 .../Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp    |  3 +-
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   | 64 ++++++++-----------
 .../NVGPU/TransformOps/NVGPUTransformOps.cpp  |  3 +-
 13 files changed, 92 insertions(+), 149 deletions(-)

diff --git a/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h b/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
index cc4e17e9527f01e..c94892fd4f8164c 100644
--- a/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
@@ -33,7 +33,6 @@ class LowerToLLVMOptions {
   LowerToLLVMOptions(MLIRContext *ctx, const DataLayout &dl);
 
   bool useBarePtrCallConv = false;
-  bool useOpaquePointers = true;
 
   enum class AllocLowering {
     /// Use malloc for heap allocations.
diff --git a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h
index 2a4327535c68750..74f9c977b702860 100644
--- a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h
@@ -125,17 +125,6 @@ class LLVMTypeConverter : public TypeConverter {
   /// integer type with the size configured for this type converter.
   Type getIndexType() const;
 
-  /// Returns true if using opaque pointers was enabled in the lowering options.
-  bool useOpaquePointers() const { return getOptions().useOpaquePointers; }
-
-  /// Creates an LLVM pointer type with the given element type and address
-  /// space.
-  /// This function is meant to be used in code supporting both typed and opaque
-  /// pointers, as it will create an opaque pointer with the given address space
-  /// if opaque pointers are enabled in the lowering options.
-  LLVM::LLVMPointerType getPointerType(Type elementType,
-                                       unsigned addressSpace = 0) const;
-
   /// Gets the bitwidth of the index type when converted to LLVM.
   unsigned getIndexTypeBitwidth() const { return options.getIndexBitwidth(); }
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index 8da609755f6cae6..123ce36cb0a7951 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -43,25 +43,21 @@ LLVM::LLVMFuncOp lookupOrCreatePrintF64Fn(ModuleOp moduleOp);
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
 LLVM::LLVMFuncOp
-lookupOrCreatePrintStringFn(ModuleOp moduleOp, bool opaquePointers,
+lookupOrCreatePrintStringFn(ModuleOp moduleOp,
                             std::optional<StringRef> runtimeFunctionName = {});
 LLVM::LLVMFuncOp lookupOrCreatePrintOpenFn(ModuleOp moduleOp);
 LLVM::LLVMFuncOp lookupOrCreatePrintCloseFn(ModuleOp moduleOp);
 LLVM::LLVMFuncOp lookupOrCreatePrintCommaFn(ModuleOp moduleOp);
 LLVM::LLVMFuncOp lookupOrCreatePrintNewlineFn(ModuleOp moduleOp);
-LLVM::LLVMFuncOp lookupOrCreateMallocFn(ModuleOp moduleOp, Type indexType,
-                                        bool opaquePointers = true);
-LLVM::LLVMFuncOp lookupOrCreateAlignedAllocFn(ModuleOp moduleOp, Type indexType,
-                                              bool opaquePointers = true);
-LLVM::LLVMFuncOp lookupOrCreateFreeFn(ModuleOp moduleOp,
-                                      bool opaquePointers = true);
-LLVM::LLVMFuncOp lookupOrCreateGenericAllocFn(ModuleOp moduleOp, Type indexType,
-                                              bool opaquePointers = true);
-LLVM::LLVMFuncOp
-lookupOrCreateGenericAlignedAllocFn(ModuleOp moduleOp, Type indexType,
-                                    bool opaquePointers = true);
-LLVM::LLVMFuncOp lookupOrCreateGenericFreeFn(ModuleOp moduleOp,
-                                             bool opaquePointers = true);
+LLVM::LLVMFuncOp lookupOrCreateMallocFn(ModuleOp moduleOp, Type indexType);
+LLVM::LLVMFuncOp lookupOrCreateAlignedAllocFn(ModuleOp moduleOp,
+                                              Type indexType);
+LLVM::LLVMFuncOp lookupOrCreateFreeFn(ModuleOp moduleOp);
+LLVM::LLVMFuncOp lookupOrCreateGenericAllocFn(ModuleOp moduleOp,
+                                              Type indexType);
+LLVM::LLVMFuncOp lookupOrCreateGenericAlignedAllocFn(ModuleOp moduleOp,
+                                                     Type indexType);
+LLVM::LLVMFuncOp lookupOrCreateGenericFreeFn(ModuleOp moduleOp);
 LLVM::LLVMFuncOp lookupOrCreateMemRefCopyFn(ModuleOp moduleOp, Type indexType,
                                             Type unrankedDescriptorType);
 
diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 84e145c98e971e7..bd50c67fb87958a 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -241,7 +241,7 @@ static void wrapExternalFunction(OpBuilder &builder, Location loc,
                     builder, loc, typeConverter, unrankedMemRefType,
                     wrapperArgsRange.take_front(numToDrop));
 
-      auto ptrTy = typeConverter.getPointerType(packed.getType());
+      auto ptrTy = LLVM::LLVMPointerType::get(builder.getContext());
       Value one = builder.create<LLVM::ConstantOp>(
           loc, typeConverter.convertType(builder.getIndexType()),
           builder.getIntegerAttr(builder.getIndexType(), 1));
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6d2585aa30ab4c5..a747e9742d4fb72 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -106,18 +106,13 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
 
     for (const auto &en : llvm::enumerate(workgroupBuffers)) {
       LLVM::GlobalOp global = en.value();
+      auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
+                                                global.getAddrSpace());
       Value address = rewriter.create<LLVM::AddressOfOp>(
-          loc,
-          getTypeConverter()->getPointerType(global.getType(),
-                                             global.getAddrSpace()),
-          global.getSymNameAttr());
-      auto elementType =
-          cast<LLVM::LLVMArrayType>(global.getType()).getElementType();
-      Value memory = rewriter.create<LLVM::GEPOp>(
-          loc,
-          getTypeConverter()->getPointerType(elementType,
-                                             global.getAddrSpace()),
-          global.getType(), address, ArrayRef<LLVM::GEPArg>{0, 0});
+          loc, ptrType, global.getSymNameAttr());
+      Value memory =
+          rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(), address,
+                                       ArrayRef<LLVM::GEPArg>{0, 0});
 
       // Build a memref descriptor pointing to the buffer to plug with the
       // existing memref infrastructure. This may use more registers than
@@ -143,7 +138,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
       // memory space and does not support `alloca`s with addrspace(5).
       Type elementType = typeConverter->convertType(type.getElementType());
       auto ptrType =
-          getTypeConverter()->getPointerType(elementType, allocaAddrSpace);
+          LLVM::LLVMPointerType::get(rewriter.getContext(), allocaAddrSpace);
       Value numElements = rewriter.create<LLVM::ConstantOp>(
           gpuFuncOp.getLoc(), int64Ty, type.getNumElements());
       uint64_t alignment = 0;
@@ -275,7 +270,7 @@ LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
   Location loc = gpuPrintfOp->getLoc();
 
   mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
-  mlir::Type i8Ptr = getTypeConverter()->getPointerType(llvmI8);
+  auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
   mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
   mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
   // Note: this is the GPUModule op, not the ModuleOp that surrounds it
@@ -298,7 +293,7 @@ LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
       moduleOp, loc, rewriter, "__ockl_printf_append_string_n",
       LLVM::LLVMFunctionType::get(
           llvmI64,
-          {llvmI64, i8Ptr, /*length (bytes)*/ llvmI64, /*isLast*/ llvmI32}));
+          {llvmI64, ptrType, /*length (bytes)*/ llvmI64, /*isLast*/ llvmI32}));
 
   /// Start the printf hostcall
   Value zeroI64 = rewriter.create<LLVM::ConstantOp>(loc, llvmI64, 0);
@@ -326,10 +321,10 @@ LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
   // Get a pointer to the format string's first element and pass it to printf()
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
       loc,
-      getTypeConverter()->getPointerType(globalType, global.getAddrSpace()),
+      LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
       global.getSymNameAttr());
   Value stringStart = rewriter.create<LLVM::GEPOp>(
-      loc, i8Ptr, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
+      loc, ptrType, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
   Value stringLen =
       rewriter.create<LLVM::ConstantOp>(loc, llvmI64, formatStringSize);
 
@@ -386,15 +381,17 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
   Location loc = gpuPrintfOp->getLoc();
 
   mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
-  mlir::Type i8Ptr = getTypeConverter()->getPointerType(llvmI8, addressSpace);
+  mlir::Type ptrType =
+      LLVM::LLVMPointerType::get(rewriter.getContext(), addressSpace);
 
   // Note: this is the GPUModule op, not the ModuleOp that surrounds it
   // This ensures that global constants and declarations are placed within
   // the device code, not the host code
   auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
 
-  auto printfType = LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {i8Ptr},
-                                                /*isVarArg=*/true);
+  auto printfType =
+      LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType},
+                                  /*isVarArg=*/true);
   LLVM::LLVMFuncOp printfDecl =
       getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType);
 
@@ -418,10 +415,10 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
   // Get a pointer to the format string's first element
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
       loc,
-      getTypeConverter()->getPointerType(globalType, global.getAddrSpace()),
+      LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
       global.getSymNameAttr());
   Value stringStart = rewriter.create<LLVM::GEPOp>(
-      loc, i8Ptr, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
+      loc, ptrType, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
 
   // Construct arguments and function call
   auto argsRange = adaptor.getArgs();
@@ -473,8 +470,7 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite(
   // Get a pointer to the format string's first element
   Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
   Value stringStart = rewriter.create<LLVM::GEPOp>(
-      loc, getTypeConverter()->getPointerType(globalType), globalType,
-      globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
+      loc, ptrType, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
   SmallVector<Type> types;
   SmallVector<Value> args;
   // Promote and pack the arguments into a stack allocation.
@@ -498,8 +494,7 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite(
                                       /*alignment=*/0);
   for (auto [index, arg] : llvm::enumerate(args)) {
     Value ptr = rewriter.create<LLVM::GEPOp>(
-        loc, getTypeConverter()->getPointerType(structType), structType,
-        tempAlloc, ArrayRef<LLVM::GEPArg>{0, index});
+        loc, ptrType, structType, tempAlloc, ArrayRef<LLVM::GEPArg>{0, index});
     rewriter.create<LLVM::StoreOp>(loc, arg, ptr);
   }
   std::array<Value, 2> printfArgs = {stringStart, tempAlloc};
diff --git a/mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp b/mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp
index 023fd6244ce1afb..da084b89ceadc25 100644
--- a/mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp
@@ -486,10 +486,10 @@ Value UnrankedMemRefDescriptor::size(OpBuilder &builder, Location loc,
                                      Value sizeBasePtr, Value index) {
 
   Type indexTy = typeConverter.getIndexType();
-  Type indexPtrTy = typeConverter.getPointerType(indexTy);
+  auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());
 
   Value sizeStoreGep =
-      builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy, sizeBasePtr, index);
+      builder.create<LLVM::GEPOp>(loc, ptrType, indexTy, sizeBasePtr, index);
   return builder.create<LLVM::LoadOp>(loc, indexTy, sizeStoreGep);
 }
 
@@ -498,10 +498,10 @@ void UnrankedMemRefDescriptor::setSize(OpBuilder &builder, Location loc,
                                        Value sizeBasePtr, Value index,
                                        Value size) {
   Type indexTy = typeConverter.getIndexType();
-  Type indexPtrTy = typeConverter.getPointerType(indexTy);
+  auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());
 
   Value sizeStoreGep =
-      builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy, sizeBasePtr, index);
+      builder.create<LLVM::GEPOp>(loc, ptrType, indexTy, sizeBasePtr, index);
   builder.create<LLVM::StoreOp>(loc, size, sizeStoreGep);
 }
 
@@ -509,10 +509,9 @@ Value UnrankedMemRefDescriptor::strideBasePtr(
     OpBuilder &builder, Location loc, const LLVMTypeConverter &typeConverter,
     Value sizeBasePtr, Value rank) {
   Type indexTy = typeConverter.getIndexType();
-  Type indexPtrTy = typeConverter.getPointerType(indexTy);
+  auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());
 
-  return builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy, sizeBasePtr,
-                                     rank);
+  return builder.create<LLVM::GEPOp>(loc, ptrType, indexTy, sizeBasePtr, rank);
 }
 
 Value UnrankedMemRefDescriptor::stride(OpBuilder &builder, Location loc,
@@ -520,10 +519,10 @@ Value UnrankedMemRefDescriptor::stride(OpBuilder &builder, Location loc,
                                        Value strideBasePtr, Value index,
                                        Value stride) {
   Type indexTy = typeConverter.getIndexType();
-  Type indexPtrTy = typeConverter.getPointerType(indexTy);
+  auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());
 
-  Value strideStoreGep = builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy,
-                                                     strideBasePtr, index);
+  Value strideStoreGep =
+      builder.create<LLVM::GEPOp>(loc, ptrType, indexTy, strideBasePtr, index);
   return builder.create<LLVM::LoadOp>(loc, indexTy, strideStoreGep);
 }
 
@@ -532,9 +531,9 @@ void UnrankedMemRefDescriptor::setStride(OpBuilder &builder, Location loc,
                                          Value strideBasePtr, Value index,
                                          Value stride) {
   Type indexTy = typeConverter.getIndexType();
-  Type indexPtrTy = typeConverter.getPointerType(indexTy);
+  auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());
 
-  Value strideStoreGep = builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy,
-                                                     strideBasePtr, index);
+  Value strideStoreGep =
+      builder.create<LLVM::GEPOp>(loc, ptrType, indexTy, strideBasePtr, index);
   builder.create<LLVM::StoreOp>(loc, stride, strideStoreGep);
 }
diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp
index 40d4c97975a6a94..83c31a204efc7e0 100644
--- a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp
@@ -47,8 +47,7 @@ Type ConvertToLLVMPattern::getVoidType() const {
 }
 
 Type ConvertToLLVMPattern::getVoidPtrType() const {
-  return getTypeConverter()->getPointerType(
-      IntegerType::get(&getTypeConverter()->getContext(), 8));
+  return LLVM::LLVMPointerType::get(&getTypeConverter()->getContext());
 }
 
 Value ConvertToLLVMPattern::createIndexAttrConstant(OpBuilder &builder,
@@ -106,12 +105,10 @@ bool ConvertToLLVMPattern::isConvertibleAndHasIdentityMaps(
 }
 
 Type ConvertToLLVMPattern::getElementPtrType(MemRefType type) const {
-  auto elementType = type.getElementType();
-  auto structElementType = typeConverter->convertType(elementType);
   auto addressSpace = getTypeConverter()->getMemRefAddressSpace(type);
   if (failed(addressSpace))
     return {};
-  return getTypeConverter()->getPointerType(structElementType, *addressSpace);
+  return LLVM::LLVMPointerType::get(type.getContext(), *addressSpace);
 }
 
 void ConvertToLLVMPattern::getMemRefDescriptorSizes(
@@ -161,7 +158,7 @@ void ConvertToLLVMPattern::getMemRefDescriptorSizes(
   if (sizeInBytes) {
     // Buffer size in bytes.
     Type elementType = typeConverter->convertType(memRefType.getElementType());
-    Type elementPtrType = getTypeConverter()->getPointerType(elementType);
+    auto elementPtrType = LLVM::LLVMPointerType::get(rewriter.getContext());
     Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, elementPtrType);
     Value gepPtr = rewriter.create<LLVM::GEPOp>(
         loc, elementPtrType, elementType, nullPtr, runningStride);
@@ -179,7 +176,7 @@ Value ConvertToLLVMPattern::getSizeInBytes(
   //   %1 = ptrtoint %elementType* %0 to %indexType
   // which is a common pattern of getting the size of a type in bytes.
   Type llvmType = typeConverter->convertType(type);
-  auto convertedPtrType = getTypeConverter()->getPointerType(llvmType);
+  auto convertedPtrType = LLVM::LLVMPointerType::get(rewriter.getContext());
   auto nullPtr = rewriter.create<LLVM::ZeroOp>(loc, convertedPtrType);
   auto gep = rewriter.create<LLVM::GEPOp>(loc, convertedPtrType, llvmType,
                                           nullPtr, ArrayRef<LLVM::GEPArg>{1});
@@ -283,11 +280,9 @@ LogicalResult ConvertToLLVMPattern::copyUnrankedDescriptors(
   auto module = builder.getInsertionPoint()->getParentOfType<ModuleOp>();
   LLVM::LLVMFuncOp freeFunc, mallocFunc;
   if (toDynamic)
-    mallocFunc = LLVM::lookupOrCreateMallocFn(
-        module, indexType, getTypeConverter()->useOpaquePointers());
+    mallocFunc = LLVM::lookupOrCreateMallocFn(module, indexType);
   if (!toDynamic)
-    freeFunc = LLVM::lookupOrCreateFreeFn(
-        module, getTypeConverter()->useOpaquePointers());
+    freeFunc = LLVM::lookupOrCreateFreeFn(module);
 
   unsigned unrankedMemrefPos = 0;
   for (unsigned i = 0, e = operands.size(); i < e; ++i) {
diff --git a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
index 6293643ac6f0349..bd7b401efec17a9 100644
--- a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
@@ -51,16 +51,16 @@ void mlir::LLVM::createPrintStrCall(
       loc, arrayTy, /*constant=*/true, LLVM::Linkage::Private,
       ensureSymbolNameIsUnique(moduleOp, symbolName), dataAttr);
 
+  auto ptrTy = LLVM::LLVMPointerType::get(builder.getContext());
   // Emit call to `printStr` in runtime library.
   builder.restoreInsertionPoint(ip);
-  auto msgAddr = builder.create<LLVM::AddressOfOp>(
-      loc, typeConverter.getPointerType(arrayTy), globalOp.getName());
+  auto msgAddr =
+      builder.create<LLVM::AddressOfOp>(loc, ptrTy, globalOp.getName());
   SmallVector<LLVM::GEPArg> indices(1, 0);
-  Value gep = builder.create<LLVM::GEPOp>(
-      loc, typeConverter.getPointerType(builder.getI8Type()), arrayTy, msgAddr,
-      indices);
-  Operation *printer = LLVM::lookupOrCreatePrintStringFn(
-      moduleOp, typeConverter.useOpaquePointers(), runtimeFunctionName);
+  Value gep =
+      builder.create<LLVM::GEPOp>(loc, ptrTy, arrayTy, msgAddr, indices);
+  Operation *printer =
+      LLVM::lookupOrCreatePrintStringFn(moduleOp, runtimeFunctionName);
   builder.create<LLVM::CallOp>(loc, TypeRange(), SymbolRefAttr::get(printer),
                                gep);
 }
diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
index fe3a8c6d410902b..35b95d7a5ebe925 100644
--- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
@@ -209,14 +209,6 @@ Type LLVMTypeConverter::getIndexType() const {
   return IntegerType::get(&getContext(), getIndexTypeBitwidth());
 }
 
-LLVM::LLVMPointerType
-LLVMTypeConverter::getPointerType(Type elementType,
-                                  unsigned int addressSpace) const {
-  if (useOpaquePointers())
-    return LLVM::LLVMPointerType::get(&getContext(), addressSpace);
-  return LLVM::LLVMPointerType::get(elementType, addressSpace);
-}
-
 unsigned LLVMTypeConverter::getPointerBitwidth(unsigned addressSpace) const {
   return options.dataLayout.getPointerSizeInBits(addressSpace);
 }
@@ -249,12 +241,7 @@ Type LLVMTypeConverter::convertComplexType(ComplexType type) const {
 // Except for signatures, MLIR function types are converted into LLVM
 // pointer-to-function types.
 Type LLVMTypeConverter::convertFunctionType(FunctionType type) const {
-  SignatureConversion conversion(type.getNumInputs());
-  Type converted = convertFunctionSignature(
-      type, /*isVariadic=*/false, options.useBarePtrCallConv, conversion);
-  if (!converted)
-    return {};
-  return getPointerType(converted);
+  return LLVM::LLVMPointerType::get(type.getContext());
 }
 
 // Function types are converted to LLVM Function types by recursively converting
@@ -301,11 +288,12 @@ LLVMTypeConverter::convertFunctionTypeCWrapper(FunctionType type) const {
   if (!resultType)
     return {};
 
+  auto ptrType = LLVM::LLVMPointerType::get(type.getContext());
   auto structType = dyn_cast<LLVM::LLVMStructType>(resultType);
   if (structType) {
     // Struct types cannot be safely returned via C interface. Make this a
     // pointer argument, instead.
-    inputs.push_back(getPointerType(structType));
+    inputs.push_back(ptrType);
     resultType = LLVM::LLVMVoidType::get(&getContext());
   }
 
@@ -314,7 +302,7 @@ LLVMTypeConverter::convertFunctionTypeCWrapper(FunctionType type) const {
     if (!converted || !LLVM::isCompatibleType(converted))
       return {};
     if (isa<MemRefType, UnrankedMemRefType>(t))
-      converted = getPointerType(converted);
+      converted = ptrType;
     inputs.push_back(converted);
   }
 
@@ -373,7 +361,7 @@ LLVMTypeConverter::getMemRefDescriptorFields(MemRefType type,
            "failed. Consider adding memory space conversions.";
     return {};
   }
-  auto ptrTy = getPointerType(elementType, *addressSpace);
+  auto ptrTy = LLVM::LLVMPointerType::get(type.getContext(), *addressSpace);
 
   auto indexTy = getIndexType();
 
@@ -419,7 +407,7 @@ Type LLVMTypeConverter::convertMemRefType(MemRefType type) const {
 ///    be unranked.
 SmallVector<Type, 2>
 LLVMTypeConverter::getUnrankedMemRefDescriptorFields() const {
-  return {getIndexType(), getPointerType(IntegerType::get(&getContext(), 8))};
+  return {getIndexType(), LLVM::LLVMPointerType::get(&getContext())};
 }
 
 unsigned LLVMTypeConverter::getUnrankedMemRefDescriptorSize(
@@ -487,7 +475,7 @@ Type LLVMTypeConverter::convertMemRefToBarePtr(BaseMemRefType type) const {
   FailureOr<unsigned> addressSpace = getMemRefAddressSpace(type);
   if (failed(addressSpace))
     return {};
-  return getPointerType(elementType, *addressSpace);
+  return LLVM::LLVMPointerType::get(type.getContext(), *addressSpace);
 }
 
 /// Convert an n-D vector type to an LLVM vector type:
@@ -593,7 +581,7 @@ Value LLVMTypeConverter::promoteOneMemRefDescriptor(Location loc, Value operand,
                                                     OpBuilder &builder) const {
   // Alloca with proper alignment. We do not expect optimizations of this
   // alloca op and so we omit allocating at the entry block.
-  auto ptrType = getPointerType(operand.getType());
+  auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());
   Value one = builder.create<LLVM::ConstantOp>(loc, builder.getI64Type(),
                                                builder.getIndexAttr(1));
   Value allocated =
diff --git a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp
index 7e3fb9e95bc2cd9..b29abc94ce400c4 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp
@@ -62,9 +62,7 @@ static Value castAllocFuncResult(ConversionPatternRewriter &rewriter,
   unsigned memrefAddrSpace = *maybeMemrefAddrSpace;
   if (allocatedPtrTy.getAddressSpace() != memrefAddrSpace)
     allocatedPtr = rewriter.create<LLVM::AddrSpaceCastOp>(
-        loc,
-        typeConverter.getPointerType(allocatedPtrTy.getElementType(),
-                                     memrefAddrSpace),
+        loc, LLVM::LLVMPointerType::get(rewriter.getContext(), memrefAddrSpace),
         allocatedPtr);
   return allocatedPtr;
 }
diff --git a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
index cbc09c9ddfa3aa9..c62e676efc159a0 100644
--- a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
+++ b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
@@ -314,9 +314,8 @@ static unsigned mapToAddressSpace(spirv::ClientAPI clientAPI,
 static Type convertPointerType(spirv::PointerType type,
                                LLVMTypeConverter &converter,
                                spirv::ClientAPI clientAPI) {
-  auto pointeeType = converter.convertType(type.getPointeeType());
   unsigned addressSpace = mapToAddressSpace(clientAPI, type.getStorageClass());
-  return converter.getPointerType(pointeeType, addressSpace);
+  return LLVM::LLVMPointerType::get(type.getContext(), addressSpace);
 }
 
 /// Converts SPIR-V runtime array to LLVM array. Since LLVM allows indexing over
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index 7ed8296a22a4570..0004c2e3403e53a 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -93,25 +93,19 @@ LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintF64Fn(ModuleOp moduleOp) {
                           LLVM::LLVMVoidType::get(moduleOp->getContext()));
 }
 
-static LLVM::LLVMPointerType getCharPtr(MLIRContext *context,
-                                        bool opaquePointers) {
-  if (opaquePointers)
-    return LLVM::LLVMPointerType::get(context);
-
-  return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
+static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
+  return LLVM::LLVMPointerType::get(context);
 }
 
-static LLVM::LLVMPointerType getVoidPtr(MLIRContext *context,
-                                        bool opaquePointers) {
+static LLVM::LLVMPointerType getVoidPtr(MLIRContext *context) {
   // A char pointer and void ptr are the same in LLVM IR.
-  return getCharPtr(context, opaquePointers);
+  return getCharPtr(context);
 }
 
 LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintStringFn(
-    ModuleOp moduleOp, bool opaquePointers,
-    std::optional<StringRef> runtimeFunctionName) {
+    ModuleOp moduleOp, std::optional<StringRef> runtimeFunctionName) {
   return lookupOrCreateFn(moduleOp, runtimeFunctionName.value_or(kPrintString),
-                          getCharPtr(moduleOp->getContext(), opaquePointers),
+                          getCharPtr(moduleOp->getContext()),
                           LLVM::LLVMVoidType::get(moduleOp->getContext()));
 }
 
@@ -136,48 +130,40 @@ LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintNewlineFn(ModuleOp moduleOp) {
 }
 
 LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateMallocFn(ModuleOp moduleOp,
-                                                    Type indexType,
-                                                    bool opaquePointers) {
-  return LLVM::lookupOrCreateFn(
-      moduleOp, kMalloc, indexType,
-      getVoidPtr(moduleOp->getContext(), opaquePointers));
+                                                    Type indexType) {
+  return LLVM::lookupOrCreateFn(moduleOp, kMalloc, indexType,
+                                getVoidPtr(moduleOp->getContext()));
 }
 
 LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateAlignedAllocFn(ModuleOp moduleOp,
-                                                          Type indexType,
-                                                          bool opaquePointers) {
-  return LLVM::lookupOrCreateFn(
-      moduleOp, kAlignedAlloc, {indexType, indexType},
-      getVoidPtr(moduleOp->getContext(), opaquePointers));
+                                                          Type indexType) {
+  return LLVM::lookupOrCreateFn(moduleOp, kAlignedAlloc, {indexType, indexType},
+                                getVoidPtr(moduleOp->getContext()));
 }
 
-LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateFreeFn(ModuleOp moduleOp,
-                                                  bool opaquePointers) {
+LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateFreeFn(ModuleOp moduleOp) {
   return LLVM::lookupOrCreateFn(
-      moduleOp, kFree, getVoidPtr(moduleOp->getContext(), opaquePointers),
+      moduleOp, kFree, getVoidPtr(moduleOp->getContext()),
       LLVM::LLVMVoidType::get(moduleOp->getContext()));
 }
 
 LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateGenericAllocFn(ModuleOp moduleOp,
-                                                          Type indexType,
-                                                          bool opaquePointers) {
-  return LLVM::lookupOrCreateFn(
-      moduleOp, kGenericAlloc, indexType,
-      getVoidPtr(moduleOp->getContext(), opaquePointers));
+                                                          Type indexType) {
+  return LLVM::lookupOrCreateFn(moduleOp, kGenericAlloc, indexType,
+                                getVoidPtr(moduleOp->getContext()));
 }
 
-LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateGenericAlignedAllocFn(
-    ModuleOp moduleOp, Type indexType, bool opaquePointers) {
-  return LLVM::lookupOrCreateFn(
-      moduleOp, kGenericAlignedAlloc, {indexType, indexType},
-      getVoidPtr(moduleOp->getContext(), opaquePointers));
+LLVM::LLVMFuncOp
+mlir::LLVM::lookupOrCreateGenericAlignedAllocFn(ModuleOp moduleOp,
+                                                Type indexType) {
+  return LLVM::lookupOrCreateFn(moduleOp, kGenericAlignedAlloc,
+                                {indexType, indexType},
+                                getVoidPtr(moduleOp->getContext()));
 }
 
-LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateGenericFreeFn(ModuleOp moduleOp,
-                                                         bool opaquePointers) {
+LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateGenericFreeFn(ModuleOp moduleOp) {
   return LLVM::lookupOrCreateFn(
-      moduleOp, kGenericFree,
-      getVoidPtr(moduleOp->getContext(), opaquePointers),
+      moduleOp, kGenericFree, getVoidPtr(moduleOp->getContext()),
       LLVM::LLVMVoidType::get(moduleOp->getContext()));
 }
 
diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
index 408c1dc798feeb4..316ed17caf47dcd 100644
--- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
@@ -99,8 +99,7 @@ void transform::ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns(
       });
   llvmTypeConverter.addConversion(
       [&](nvgpu::TensorMapDescriptorType type) -> Type {
-        return llvmTypeConverter.getPointerType(
-            type.getTensor().getElementType());
+        return LLVM::LLVMPointerType::get(type.getContext());
       });
   populateNVGPUToNVVMConversionPatterns(llvmTypeConverter, patterns);
 }

>From 51485019fb34a48dc6226bfa42d7449091e3f03d Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 1 Nov 2023 16:27:29 +0000
Subject: [PATCH 17/76] [NFC][LLVM][SVE] Refactor predicate register ASM
 constraint parsing to use std::optional.
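
A condensed, standalone sketch of the new shape of the parsing code
(the enum and constraint names are taken from the patch below; the
isPredicateConstraint helper is purely illustrative and not part of
the change):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"
  #include <optional>

  enum class PredicateConstraint { Uph, Upl, Upa };

  // Returning std::optional replaces the old 'Invalid' sentinel
  // enumerator; callers test the optional instead of comparing
  // against Invalid.
  static std::optional<PredicateConstraint>
  parsePredicateConstraint(llvm::StringRef Constraint) {
    return llvm::StringSwitch<std::optional<PredicateConstraint>>(Constraint)
        .Case("Uph", PredicateConstraint::Uph)
        .Case("Upl", PredicateConstraint::Upl)
        .Case("Upa", PredicateConstraint::Upa)
        .Default(std::nullopt);
  }

  // Caller side: the unparsed case now falls out naturally.
  static bool isPredicateConstraint(llvm::StringRef C) {
    return parsePredicateConstraint(C).has_value();
  }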

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 291f0c8c5d991c6..94901c2d1a65688 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10163,14 +10163,15 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
   return "r";
 }
 
-enum PredicateConstraint { Uph, Upl, Upa, Invalid };
+enum class PredicateConstraint { Uph, Upl, Upa };
 
-static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
-  return StringSwitch<PredicateConstraint>(Constraint)
+static std::optional<PredicateConstraint>
+parsePredicateConstraint(StringRef Constraint) {
+  return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
       .Case("Uph", PredicateConstraint::Uph)
       .Case("Upl", PredicateConstraint::Upl)
       .Case("Upa", PredicateConstraint::Upa)
-      .Default(PredicateConstraint::Invalid);
+      .Default(std::nullopt);
 }
 
 static const TargetRegisterClass *
@@ -10180,8 +10181,6 @@ getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
     return nullptr;
 
   switch (Constraint) {
-  default:
-    return nullptr;
   case PredicateConstraint::Uph:
     return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
                                      : &AArch64::PPR_p8to15RegClass;
@@ -10192,6 +10191,8 @@ getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
     return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
                                      : &AArch64::PPRRegClass;
   }
+
+  llvm_unreachable("Missing PredicateConstraint!");
 }
 
 // The set of cc code supported is from
@@ -10289,9 +10290,8 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
     case 'S': // A symbolic address
       return C_Other;
     }
-  } else if (parsePredicateConstraint(Constraint) !=
-             PredicateConstraint::Invalid)
-      return C_RegisterClass;
+  } else if (parsePredicateConstraint(Constraint))
+    return C_RegisterClass;
   else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
     return C_Other;
   return TargetLowering::getConstraintType(Constraint);
@@ -10325,7 +10325,7 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
     weight = CW_Constant;
     break;
   case 'U':
-    if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
+    if (parsePredicateConstraint(constraint))
       weight = CW_Register;
     break;
   }
@@ -10382,9 +10382,9 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
       break;
     }
   } else {
-    PredicateConstraint PC = parsePredicateConstraint(Constraint);
-    if (const TargetRegisterClass *RegClass = getPredicateRegisterClass(PC, VT))
-      return std::make_pair(0U, RegClass);
+    if (const auto PC = parsePredicateConstraint(Constraint))
+      if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
+        return std::make_pair(0U, RegClass);
   }
   if (StringRef("{cc}").equals_insensitive(Constraint) ||
       parseConstraintCode(Constraint) != AArch64CC::Invalid)

>From 236197a065879c900a450e5236adbb76578006cb Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 13:03:51 +0100
Subject: [PATCH 18/76] [ConstantFold] Simplify evaluateICmpRelation()
 implementation (NFCI)

Clarify that most of these folds apply only to pointer icmps, and
remove the unnecessary isSigned parameter as well as the ConstantInt
fallback.

Also perform complexity sorting upfront, so we don't need to deal
with swapped cases in the individual branches.
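
A minimal sketch of the new control flow, condensed from the patch
(simplified; only the upfront canonicalization step is shown):

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static ICmpInst::Predicate evaluateICmpRelation(Constant *V1,
                                                  Constant *V2) {
    // Rank operands so the "most complex" one is always first:
    // ConstantExpr (3) > GlobalValue (2) > BlockAddress (1) > rest (0).
    auto GetComplexity = [](Constant *V) {
      if (isa<ConstantExpr>(V))
        return 3;
      if (isa<GlobalValue>(V))
        return 2;
      if (isa<BlockAddress>(V))
        return 1;
      return 0;
    };
    if (GetComplexity(V1) < GetComplexity(V2)) {
      // Evaluate with the operands swapped, then swap the predicate
      // back: e.g. a ULT answer for (V2, V1) becomes UGT for (V1, V2).
      ICmpInst::Predicate Swapped = evaluateICmpRelation(V2, V1);
      if (Swapped != ICmpInst::BAD_ICMP_PREDICATE)
        return ICmpInst::getSwappedPredicate(Swapped);
      return ICmpInst::BAD_ICMP_PREDICATE;
    }
    // ... the per-kind branches below can now assume V2 is no more
    // complex than V1, so no branch needs its own swap handling ...
    return ICmpInst::BAD_ICMP_PREDICATE;
  }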
---
 llvm/lib/IR/ConstantFold.cpp | 106 +++++++++++++----------------------
 1 file changed, 38 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index a4df579406538a4..d4bc74e00f19bb9 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1142,57 +1142,51 @@ static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1,
 /// If we can determine that the two constants have a particular relation to
 /// each other, we should return the corresponding ICmp predicate, otherwise
 /// return ICmpInst::BAD_ICMP_PREDICATE.
-///
-/// To simplify this code we canonicalize the relation so that the first
-/// operand is always the most "complex" of the two.  We consider simple
-/// constants (like ConstantInt) to be the simplest, followed by
-/// GlobalValues, followed by ConstantExpr's (the most complex).
-///
-static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
-                                                bool isSigned) {
+static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2) {
   assert(V1->getType() == V2->getType() &&
          "Cannot compare different types of values!");
   if (V1 == V2) return ICmpInst::ICMP_EQ;
 
-  if (!isa<ConstantExpr>(V1) && !isa<GlobalValue>(V1) &&
-      !isa<BlockAddress>(V1)) {
-    if (!isa<GlobalValue>(V2) && !isa<ConstantExpr>(V2) &&
-        !isa<BlockAddress>(V2)) {
-      // We distilled this down to a simple case, use the standard constant
-      // folder.
-      ConstantInt *R = nullptr;
-      ICmpInst::Predicate pred = ICmpInst::ICMP_EQ;
-      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
-      if (R && !R->isZero())
-        return pred;
-      pred = isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
-      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
-      if (R && !R->isZero())
-        return pred;
-      pred = isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
-      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
-      if (R && !R->isZero())
-        return pred;
-
-      // If we couldn't figure it out, bail.
-      return ICmpInst::BAD_ICMP_PREDICATE;
-    }
-
-    // If the first operand is simple, swap operands.
-    ICmpInst::Predicate SwappedRelation =
-      evaluateICmpRelation(V2, V1, isSigned);
+  // The following folds only apply to pointers.
+  if (!V1->getType()->isPointerTy())
+    return ICmpInst::BAD_ICMP_PREDICATE;
+
+  // To simplify this code we canonicalize the relation so that the first
+  // operand is always the most "complex" of the two.  We consider simple
+  // constants (like ConstantPointerNull) to be the simplest, followed by
+  // BlockAddress, GlobalValues, and ConstantExpr's (the most complex).
+  auto GetComplexity = [](Constant *V) {
+    if (isa<ConstantExpr>(V))
+      return 3;
+    if (isa<GlobalValue>(V))
+      return 2;
+    if (isa<BlockAddress>(V))
+      return 1;
+    return 0;
+  };
+  if (GetComplexity(V1) < GetComplexity(V2)) {
+    ICmpInst::Predicate SwappedRelation = evaluateICmpRelation(V2, V1);
     if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
       return ICmpInst::getSwappedPredicate(SwappedRelation);
+    return ICmpInst::BAD_ICMP_PREDICATE;
+  }
 
-  } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(V1)) {
-    if (isa<ConstantExpr>(V2)) {  // Swap as necessary.
-      ICmpInst::Predicate SwappedRelation =
-        evaluateICmpRelation(V2, V1, isSigned);
-      if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
-        return ICmpInst::getSwappedPredicate(SwappedRelation);
-      return ICmpInst::BAD_ICMP_PREDICATE;
+  if (const BlockAddress *BA = dyn_cast<BlockAddress>(V1)) {
+    // Now we know that the RHS is a BlockAddress or simple
+    // constant (which, since the types must match, means that it is a
+    // ConstantPointerNull).
+    if (const BlockAddress *BA2 = dyn_cast<BlockAddress>(V2)) {
+      // Block address in another function can't equal this one, but block
+      // addresses in the current function might be the same if blocks are
+      // empty.
+      if (BA2->getFunction() != BA->getFunction())
+        return ICmpInst::ICMP_NE;
+    } else {
+      // Block addresses aren't null.
+      assert(isa<ConstantPointerNull>(V2) && "Canonicalization guarantee!");
+      return ICmpInst::ICMP_NE;
     }
-
+  } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(V1)) {
     // Now we know that the RHS is a GlobalValue, BlockAddress or simple
     // constant (which, since the types must match, means that it's a
     // ConstantPointerNull).
@@ -1212,30 +1206,6 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
                                 GV->getType()->getAddressSpace()))
         return ICmpInst::ICMP_UGT;
     }
-  } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(V1)) {
-    if (isa<ConstantExpr>(V2)) {  // Swap as necessary.
-      ICmpInst::Predicate SwappedRelation =
-        evaluateICmpRelation(V2, V1, isSigned);
-      if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
-        return ICmpInst::getSwappedPredicate(SwappedRelation);
-      return ICmpInst::BAD_ICMP_PREDICATE;
-    }
-
-    // Now we know that the RHS is a GlobalValue, BlockAddress or simple
-    // constant (which, since the types must match, means that it is a
-    // ConstantPointerNull).
-    if (const BlockAddress *BA2 = dyn_cast<BlockAddress>(V2)) {
-      // Block address in another function can't equal this one, but block
-      // addresses in the current function might be the same if blocks are
-      // empty.
-      if (BA2->getFunction() != BA->getFunction())
-        return ICmpInst::ICMP_NE;
-    } else {
-      // Block addresses aren't null, don't equal the address of globals.
-      assert((isa<ConstantPointerNull>(V2) || isa<GlobalValue>(V2)) &&
-             "Canonicalization guarantee!");
-      return ICmpInst::ICMP_NE;
-    }
   } else {
     // Ok, the LHS is known to be a constantexpr.  The RHS can be any of a
     // constantexpr, a global, block address, or a simple constant.
@@ -1429,7 +1399,7 @@ Constant *llvm::ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
   } else {
     // Evaluate the relation between the two constants, per the predicate.
     int Result = -1;  // -1 = unknown, 0 = known false, 1 = known true.
-    switch (evaluateICmpRelation(C1, C2, CmpInst::isSigned(Predicate))) {
+    switch (evaluateICmpRelation(C1, C2)) {
     default: llvm_unreachable("Unknown relational!");
     case ICmpInst::BAD_ICMP_PREDICATE:
       break;  // Couldn't determine anything about these constants.

>From 05a47706476f3cd467aa1c4347fdfb71bcbc1252 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 13:09:17 +0100
Subject: [PATCH 19/76] [ConstantFold] Fix incorrect type assumptions

If a pointer isn't a constant expression, global, or block address,
it's not guaranteed to be a null pointer: it can also be a no_cfi
or dso_local_equivalent constant.
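
The tests added below cover the no_cfi case; a hypothetical analogue
for dso_local_equivalent (not part of this patch) would look the same
and must likewise be left unfolded:

  declare void @func()

  define i1 @global_dso_local_equivalent() {
    ; Must not fold: dso_local_equivalent @func need not have the same
    ; address as @func, and neither constant is known to be null.
    %cmp = icmp eq ptr @func, dso_local_equivalent @func
    ret i1 %cmp
  }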
---
 llvm/lib/IR/ConstantFold.cpp                  | 14 ++++-------
 .../InstSimplify/ConstProp/icmp-global.ll     | 23 +++++++++++++++++++
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index d4bc74e00f19bb9..3f5da236af211f6 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1172,30 +1172,24 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2) {
   }
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(V1)) {
-    // Now we know that the RHS is a BlockAddress or simple
-    // constant (which, since the types must match, means that it is a
-    // ConstantPointerNull).
+    // Now we know that the RHS is a BlockAddress or simple constant.
     if (const BlockAddress *BA2 = dyn_cast<BlockAddress>(V2)) {
       // Block address in another function can't equal this one, but block
       // addresses in the current function might be the same if blocks are
       // empty.
       if (BA2->getFunction() != BA->getFunction())
         return ICmpInst::ICMP_NE;
-    } else {
-      // Block addresses aren't null.
-      assert(isa<ConstantPointerNull>(V2) && "Canonicalization guarantee!");
+    } else if (isa<ConstantPointerNull>(V2)) {
       return ICmpInst::ICMP_NE;
     }
   } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(V1)) {
     // Now we know that the RHS is a GlobalValue, BlockAddress or simple
-    // constant (which, since the types must match, means that it's a
-    // ConstantPointerNull).
+    // constant.
     if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) {
       return areGlobalsPotentiallyEqual(GV, GV2);
     } else if (isa<BlockAddress>(V2)) {
       return ICmpInst::ICMP_NE; // Globals never equal labels.
-    } else {
-      assert(isa<ConstantPointerNull>(V2) && "Canonicalization guarantee!");
+    } else if (isa<ConstantPointerNull>(V2)) {
       // GlobalVals can never be null unless they have external weak linkage.
       // We don't try to evaluate aliases here.
       // NOTE: We should not be doing this constant folding if null pointer
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
index 701d911ea892ac4..3851bd090aef9f4 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
@@ -275,3 +275,26 @@ define i1 @global_gep_ugt_global_gep_complex() {
   %cmp = icmp ugt ptr %gep3, @g
   ret i1 %cmp
 }
+
+declare void @func()
+
+define i1 @global_no_cfi() {
+; CHECK-LABEL: @global_no_cfi(
+; CHECK-NEXT:    ret i1 icmp eq (ptr @func, ptr no_cfi @func)
+;
+  %cmp = icmp eq ptr @func, no_cfi @func
+  ret i1 %cmp
+}
+
+define i1 @blockaddr_no_cfi() {
+; CHECK-LABEL: @blockaddr_no_cfi(
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret i1 icmp eq (ptr blockaddress(@blockaddr_no_cfi, [[BB]]), ptr no_cfi @func)
+;
+  br label %bb
+
+bb:
+  %cmp = icmp eq ptr blockaddress(@blockaddr_no_cfi, %bb), no_cfi @func
+  ret i1 %cmp
+}

>From 17970df6dca3cf46f0264e94f40eb4ef93ef521f Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 3 Nov 2023 13:23:02 +0000
Subject: [PATCH 20/76] [LLVM][SVE] Move ADDVL isel patterns under
 UseScalarIncVL feature flag. (#71173)

This also removes a duplicate pattern.
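
The effect is visible in the updated tests: on targets without the
UseScalarIncVL feature, scalar pointer adjustments are now
materialized via RDVL plus ADD instead of ADDVL, e.g. (from
sve-gep.ll below):

  // Before: ADDVL pattern applied unconditionally.
  addvl x0, x0, #4
  // After, without UseScalarIncVL: read the vector length, then add.
  rdvl  x8, #4
  add   x0, x0, x8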
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 19 +++++++----------
 ...plex-deinterleaving-reductions-scalable.ll |  7 ++++---
 .../AArch64/named-vector-shuffles-sve.ll      | 21 ++++++++++---------
 .../AArch64/sve-extract-fixed-vector.ll       |  4 ++--
 llvm/test/CodeGen/AArch64/sve-gep.ll          |  6 ++++--
 .../CodeGen/AArch64/sve-insert-element.ll     |  4 ++--
 .../test/CodeGen/AArch64/sve-insert-vector.ll |  4 ++--
 .../AArch64/sve-intrinsics-loads-nf.ll        |  6 ++++--
 ...contiguous-ldst-addressing-mode-reg-imm.ll |  6 ++++--
 ...n-temporal-ldst-addressing-mode-reg-imm.ll |  6 ++++--
 .../CodeGen/AArch64/sve-split-extract-elt.ll  | 16 +++++++-------
 .../CodeGen/AArch64/sve-split-insert-elt.ll   |  8 +++----
 llvm/test/CodeGen/AArch64/sve-vl-arith.ll     | 16 +++++++-------
 13 files changed, 65 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index fc6a6a88b4fc084..a53973bad92e25f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2423,14 +2423,6 @@ let Predicates = [HasSVEorSME] in {
   }
 
   let AddedComplexity = 5 in {
-    def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
-              (ADDVL_XXI GPR64:$op, $imm)>;
-
-    def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                             GPR32:$op, sub_32), $imm),
-                                   sub_32))>;
-
     def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (splat_vector (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))),
               (INCH_ZPiI ZPR:$op, 31, $imm)>;
     def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (splat_vector (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))),
@@ -2447,6 +2439,14 @@ let Predicates = [HasSVEorSME] in {
   }
 
   let Predicates = [HasSVEorSME, UseScalarIncVL], AddedComplexity = 5 in {
+    def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+              (ADDVL_XXI GPR64:$op, $imm)>;
+
+    def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
+              (i32 (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                                             GPR32:$op, sub_32), $imm),
+                                   sub_32))>;
+
     def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
               (INCH_XPiI GPR64:$op, 31, $imm)>;
     def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))),
@@ -2488,9 +2488,6 @@ let Predicates = [HasSVEorSME] in {
                                     sub_32))>;
   }
 
-  def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
-            (ADDVL_XXI GPR64:$op, $imm)>;
-
   // FIXME: BigEndian requires an additional REV instruction to satisfy the
   // constraint that none of the bits change when stored to memory as one
   // type, and reloaded as another type.
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index aefacc605474fa7..bb58248c6f60e09 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -195,13 +195,14 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    neg x10, x9
 ; CHECK-NEXT:    mov w11, #1000 // =0x3e8
+; CHECK-NEXT:    rdvl x13, #2
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    rdvl x11, #4
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    addvl x12, x1, #2
-; CHECK-NEXT:    addvl x13, x0, #2
+; CHECK-NEXT:    rdvl x11, #4
+; CHECK-NEXT:    add x12, x1, x13
+; CHECK-NEXT:    add x13, x0, x13
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index f1e95ca9c206f01..06570b4539cc111 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -351,9 +351,9 @@ define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vsca
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-8
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    mov w9, #16 // =0x10
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #16
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
@@ -457,7 +457,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x
 ; CHECK-NEXT:    mov w9, #17 // =0x11
 ; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    cmp x8, #17
-; CHECK-NEXT:    addvl x10, x10, #1
+; CHECK-NEXT:    add x10, x10, x8
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    sub x8, x10, x8
 ; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
@@ -502,7 +502,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x
 ; CHECK-NEXT:    mov w9, #18 // =0x12
 ; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    cmp x8, #18
-; CHECK-NEXT:    addvl x10, x10, #1
+; CHECK-NEXT:    add x10, x10, x8
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    sub x8, x10, x8
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
@@ -613,7 +613,7 @@ define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale
 ; CHECK-NEXT:    mov w9, #18 // =0x12
 ; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    cmp x8, #18
-; CHECK-NEXT:    addvl x10, x10, #1
+; CHECK-NEXT:    add x10, x10, x8
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    sub x8, x10, x8
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
@@ -779,9 +779,10 @@ define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    rdvl x8, #2
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    add x8, x9, x8
 ; CHECK-NEXT:    mov x9, #-8 // =0xfffffffffffffff8
-; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    sub x10, x8, #32
 ; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
@@ -807,9 +808,9 @@ define <vscale x 16 x float> @splice_nxv16f32_neg17(<vscale x 16 x float> %a, <v
 ; CHECK-NEXT:    mov w9, #68 // =0x44
 ; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    cmp x8, #68
-; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    addvl x9, x10, #4
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    csel x9, x8, x9, lo
+; CHECK-NEXT:    add x8, x10, x8
+; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 0dd7320413a147f..d2cbbe0628f0f18 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -215,9 +215,9 @@ define <16 x i8> @extract_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec) nounwind
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, #-16 // =0xfffffffffffffff0
+; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    mov w9, #16 // =0x10
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #16
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
diff --git a/llvm/test/CodeGen/AArch64/sve-gep.ll b/llvm/test/CodeGen/AArch64/sve-gep.ll
index fd93e43613c52c2..8ebd0198e9099ae 100644
--- a/llvm/test/CodeGen/AArch64/sve-gep.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gep.ll
@@ -4,7 +4,8 @@
 define <vscale x 2 x i64>* @scalar_of_scalable_1(<vscale x 2 x i64>* %base) {
 ; CHECK-LABEL: scalar_of_scalable_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x0, x0, #4
+; CHECK-NEXT:    rdvl x8, #4
+; CHECK-NEXT:    add x0, x0, x8
 ; CHECK-NEXT:    ret
   %d = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 4
   ret <vscale x 2 x i64>* %d
@@ -202,7 +203,8 @@ define <vscale x 2 x i64*> @scalable_of_fixed_5_i64(i64* %base, <vscale x 2 x i3
 define <vscale x 2 x <vscale x 2 x i64>*> @scalable_of_scalable_1(<vscale x 2 x i64>* %base) {
 ; CHECK-LABEL: scalable_of_scalable_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    add x8, x0, x8
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    ret
   %idx = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 1, i32 0), <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
index f327e32c92e0068..2aa298f6d9173fe 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -590,10 +590,10 @@ define <vscale x 32 x i1> @test_predicate_insert_32xi1(<vscale x 32 x i1> %val,
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    ptrue p2.b
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    rdvl x8, #2
 ; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    mov w9, w1
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    csel x8, x9, x8, lo
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 9ca928c00299fe4..de2efe288ac4707 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -120,9 +120,9 @@ define <vscale x 16 x i8> @insert_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec, <
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, #-16 // =0xfffffffffffffff0
+; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    mov w9, #16 // =0x10
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #16
 ; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    cmp x8, #16
 ; CHECK-NEXT:    csel x8, x8, x9, lo
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
index a774af20f2e93c7..f86d999340184eb 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
@@ -17,7 +17,8 @@ define <vscale x 16 x i8> @ldnf1b(<vscale x 16 x i1> %pg, ptr %a) {
 define <vscale x 16 x i8> @ldnf1b_out_of_lower_bound(<vscale x 16 x i1> %pg, ptr %a) {
 ; CHECK-LABEL: ldnf1b_out_of_lower_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #-9
+; CHECK-NEXT:    rdvl x8, #-9
+; CHECK-NEXT:    add x8, x0, x8
 ; CHECK-NEXT:    ldnf1b { z0.b }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9
@@ -62,7 +63,8 @@ define <vscale x 16 x i8> @ldnf1b_upper_bound(<vscale x 16 x i1> %pg, ptr %a) {
 define <vscale x 16 x i8> @ldnf1b_out_of_upper_bound(<vscale x 16 x i1> %pg, ptr %a) {
 ; CHECK-LABEL: ldnf1b_out_of_upper_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #8
+; CHECK-NEXT:    rdvl x8, #8
+; CHECK-NEXT:    add x8, x0, x8
 ; CHECK-NEXT:    ldnf1b { z0.b }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
index c85779b99b048aa..c5a3945cd047487 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
@@ -9,9 +9,11 @@
 define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
 ; CHECK-LABEL: imm_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #8
+; CHECK-NEXT:    rdvl x8, #8
+; CHECK-NEXT:    add x8, x0, x8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl x8, x0, #-9
+; CHECK-NEXT:    rdvl x8, #-9
+; CHECK-NEXT:    add x8, x0, x8
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT:    ret
   %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
index 73f23c3952ccc16..c06921ee96a0d4d 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
@@ -9,9 +9,11 @@
 define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
 ; CHECK-LABEL: imm_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    addvl x8, x0, #8
+; CHECK-NEXT:    rdvl x8, #8
+; CHECK-NEXT:    add x8, x0, x8
 ; CHECK-NEXT:    ldnt1d { z0.d }, p0/z, [x8]
-; CHECK-NEXT:    addvl x8, x0, #-9
+; CHECK-NEXT:    rdvl x8, #-9
+; CHECK-NEXT:    add x8, x0, x8
 ; CHECK-NEXT:    stnt1d { z0.d }, p0, [x8]
 ; CHECK-NEXT:    ret
   %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
index d79990e9e9616ec..a1c2ec9c7e1d42f 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
@@ -23,9 +23,9 @@ define i8 @split_extract_32i8_idx(<vscale x 32 x i8> %a, i32 %idx) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    rdvl x8, #2
 ; CHECK-NEXT:    mov w9, w0
-; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    mov x9, sp
@@ -47,9 +47,9 @@ define i16 @split_extract_16i16_idx(<vscale x 16 x i16> %a, i32 %idx) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    mov w9, w0
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    mov x9, sp
@@ -141,9 +141,9 @@ define i16 @split_extract_16i16(<vscale x 16 x i16> %a) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    mov w9, #128 // =0x80
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
@@ -165,10 +165,10 @@ define i32 @split_extract_16i32(<vscale x 16 x i32> %a) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    mov w9, #34464 // =0x86a0
 ; CHECK-NEXT:    movk w9, #1, lsl #16
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
index 7984057241c8473..5441659fa5cb452 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
@@ -24,9 +24,9 @@ define <vscale x 32 x i8> @split_insert_32i8_idx(<vscale x 32 x i8> %a, i8 %elt,
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    rdvl x8, #2
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x1, x8
 ; CHECK-NEXT:    csel x8, x1, x8, lo
 ; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
@@ -136,9 +136,9 @@ define <vscale x 32 x i16> @split_insert_32i16(<vscale x 32 x i16> %a, i16 %elt)
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    rdvl x8, #2
 ; CHECK-NEXT:    mov w9, #128 // =0x80
-; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
index cd11972efc8011e..dd4294c8d3bdcc2 100644
--- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
@@ -116,7 +116,8 @@ define <vscale x 2 x i64> @decd_vec(<vscale x 2 x i64> %a) {
 define i64 @incb_scalar_i64(i64 %a) {
 ; NO_SCALAR_INC-LABEL: incb_scalar_i64:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    addvl x0, x0, #1
+; NO_SCALAR_INC-NEXT:    rdvl x8, #1
+; NO_SCALAR_INC-NEXT:    add x0, x0, x8
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: incb_scalar_i64:
@@ -185,7 +186,8 @@ define i64 @incd_scalar_i64(i64 %a) {
 define i64 @decb_scalar_i64(i64 %a) {
 ; NO_SCALAR_INC-LABEL: decb_scalar_i64:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    addvl x0, x0, #-2
+; NO_SCALAR_INC-NEXT:    rdvl x8, #-2
+; NO_SCALAR_INC-NEXT:    add x0, x0, x8
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: decb_scalar_i64:
@@ -257,9 +259,8 @@ define i64 @decd_scalar_i64(i64 %a) {
 define i32 @incb_scalar_i32(i32 %a) {
 ; NO_SCALAR_INC-LABEL: incb_scalar_i32:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    // kill: def $w0 killed $w0 def $x0
-; NO_SCALAR_INC-NEXT:    addvl x0, x0, #3
-; NO_SCALAR_INC-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NO_SCALAR_INC-NEXT:    rdvl x8, #3
+; NO_SCALAR_INC-NEXT:    add w0, w0, w8
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: incb_scalar_i32:
@@ -344,9 +345,8 @@ define i32 @incd_scalar_i32(i32 %a) {
 define i32 @decb_scalar_i32(i32 %a) {
 ; NO_SCALAR_INC-LABEL: decb_scalar_i32:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    // kill: def $w0 killed $w0 def $x0
-; NO_SCALAR_INC-NEXT:    addvl x0, x0, #-4
-; NO_SCALAR_INC-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NO_SCALAR_INC-NEXT:    rdvl x8, #-4
+; NO_SCALAR_INC-NEXT:    add w0, w0, w8
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: decb_scalar_i32:

>From 9915ebaa5e596358b9b8b6753723b613b780b09c Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 14:21:16 +0100
Subject: [PATCH 21/76] [ConstantFold] Remove redundant
 constantFoldCompareGlobalToNull() fold (NFCI)

This is already handled in evaluateICmpRelation().
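
For reference, a minimal sketch of the shape this fold handled (the
global here is illustrative):

  @g = global i32 0
  ; icmp eq (ptr @g, ptr null) --> i1 false
  ; icmp ne (ptr @g, ptr null) --> i1 true

evaluateICmpRelation() already establishes that a non-alias,
non-external-weak global cannot compare equal to null, so the
dedicated helper adds nothing.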
---
 llvm/lib/IR/ConstantFold.cpp | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 3f5da236af211f6..3028668ced62931 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1252,25 +1252,6 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2) {
   return ICmpInst::BAD_ICMP_PREDICATE;
 }
 
-static Constant *constantFoldCompareGlobalToNull(CmpInst::Predicate Predicate,
-                                                 Constant *C1, Constant *C2) {
-  const GlobalValue *GV = dyn_cast<GlobalValue>(C2);
-  if (!GV || !C1->isNullValue())
-    return nullptr;
-
-  // Don't try to evaluate aliases.  External weak GV can be null.
-  if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage() &&
-      !NullPointerIsDefined(nullptr /* F */,
-                            GV->getType()->getAddressSpace())) {
-    if (Predicate == ICmpInst::ICMP_EQ)
-      return ConstantInt::getFalse(C1->getContext());
-    else if (Predicate == ICmpInst::ICMP_NE)
-      return ConstantInt::getTrue(C1->getContext());
-  }
-
-  return nullptr;
-}
-
 Constant *llvm::ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
                                                Constant *C1, Constant *C2) {
   Type *ResultTy;
@@ -1309,14 +1290,6 @@ Constant *llvm::ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
     return ConstantInt::get(ResultTy, CmpInst::isUnordered(Predicate));
   }
 
-  // icmp eq/ne(null,GV) -> false/true
-  if (Constant *Folded = constantFoldCompareGlobalToNull(Predicate, C1, C2))
-    return Folded;
-
-  // icmp eq/ne(GV,null) -> false/true
-  if (Constant *Folded = constantFoldCompareGlobalToNull(Predicate, C2, C1))
-    return Folded;
-
   if (C2->isNullValue()) {
     // The caller is expected to commute the operands if the constant expression
     // is C2.

>From e299a4287018769b9dfdd1eea8609bb59423ae90 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 14:30:16 +0100
Subject: [PATCH 22/76] [ConstantFold] Remove redundant bitcast icmp handling
 (NFCI)

This code already excludes vector/scalar and FP bitcasts. All other
integer/pointer bitcasts are no-op bitcasts and get folded away
entirely before they ever reach this code.
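
For illustration, a typed-pointer-era sketch of what the removed
branch did (hypothetical constants; such bitcast constant expressions
no longer occur with opaque pointers):

  icmp eq i8* null, bitcast (i32* @g to i8*)
    ; rewritten by moving the inverse bitcast to the LHS:
  icmp eq i32* bitcast (i8* null to i32*), i32* @g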
---
 llvm/lib/IR/ConstantFold.cpp | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 3028668ced62931..f877fb614c65da8 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1451,20 +1451,6 @@ Constant *llvm::ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
     if (Result != -1)
       return ConstantInt::get(ResultTy, Result);
 
-    // If the right hand side is a bitcast, try using its inverse to simplify
-    // it by moving it to the left hand side.  We can't do this if it would turn
-    // a vector compare into a scalar compare or visa versa, or if it would turn
-    // the operands into FP values.
-    if (ConstantExpr *CE2 = dyn_cast<ConstantExpr>(C2)) {
-      Constant *CE2Op0 = CE2->getOperand(0);
-      if (CE2->getOpcode() == Instruction::BitCast &&
-          CE2->getType()->isVectorTy() == CE2Op0->getType()->isVectorTy() &&
-          !CE2Op0->getType()->isFPOrFPVectorTy()) {
-        Constant *Inverse = ConstantExpr::getBitCast(C1, CE2Op0->getType());
-        return ConstantExpr::getICmp(Predicate, Inverse, CE2Op0);
-      }
-    }
-
     if ((!isa<ConstantExpr>(C1) && isa<ConstantExpr>(C2)) ||
         (C1->isNullValue() && !C2->isNullValue())) {
       // If C2 is a constant expr and C1 isn't, flip them around and fold the

>From 8efaf7a518fa3cf719cdeff3917b9fedf8f2cf19 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 14:33:01 +0100
Subject: [PATCH 23/76] [ConstantFold] Remove redundant zero index gep fold
 (NFCI)

We already handle the more general case of zero-index GEPs above,
so we don't need to also handle GEPs with a null base and zero
indices. (Strictly speaking, this code could handle the special
case of an inrange GEP with a null base and zero indices, but that
has no practical relevance.)
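
A minimal illustration of the subsumed shape (hypothetical IR):

  getelementptr ([4 x i8], ptr null, i64 0, i64 0) ; folds to ptr null

The all-zero-index fold above already returns the base pointer
unchanged, which for a null base is null itself.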
---
 llvm/lib/IR/ConstantFold.cpp | 33 ---------------------------------
 1 file changed, 33 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index f877fb614c65da8..0358f996f285165 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1617,39 +1617,6 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
                      cast<VectorType>(GEPTy)->getElementCount(), C)
                : C;
 
-  if (C->isNullValue()) {
-    bool isNull = true;
-    for (Value *Idx : Idxs)
-      if (!isa<UndefValue>(Idx) && !cast<Constant>(Idx)->isNullValue()) {
-        isNull = false;
-        break;
-      }
-    if (isNull) {
-      PointerType *PtrTy = cast<PointerType>(C->getType()->getScalarType());
-      Type *Ty = GetElementPtrInst::getIndexedType(PointeeTy, Idxs);
-
-      assert(Ty && "Invalid indices for GEP!");
-      Type *OrigGEPTy = PointerType::get(Ty, PtrTy->getAddressSpace());
-      Type *GEPTy = PointerType::get(Ty, PtrTy->getAddressSpace());
-      if (VectorType *VT = dyn_cast<VectorType>(C->getType()))
-        GEPTy = VectorType::get(OrigGEPTy, VT->getElementCount());
-
-      // The GEP returns a vector of pointers when one of more of
-      // its arguments is a vector.
-      for (Value *Idx : Idxs) {
-        if (auto *VT = dyn_cast<VectorType>(Idx->getType())) {
-          assert((!isa<VectorType>(GEPTy) || isa<ScalableVectorType>(GEPTy) ==
-                                                 isa<ScalableVectorType>(VT)) &&
-                 "Mismatched GEPTy vector types");
-          GEPTy = VectorType::get(OrigGEPTy, VT->getElementCount());
-          break;
-        }
-      }
-
-      return Constant::getNullValue(GEPTy);
-    }
-  }
-
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
     if (auto *GEP = dyn_cast<GEPOperator>(CE))
       if (Constant *C = foldGEPOfGEP(GEP, PointeeTy, InBounds, Idxs))

>From ab6bd9436adaf683abf790a49060560fe3eac3a3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 3 Nov 2023 13:38:38 +0000
Subject: [PATCH 24/76] [ConstraintElim] Add tests for additional SGT->UGT
 transfer.

Test cases inspired by
https://github.com/llvm/llvm-project/issues/63126.
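
The transfer in question: once the IV is known to be non-negative and
%N is signed-greater-than the IV, both values are non-negative, so the
same relation also holds unsigned. A sketch from the shape of the
tests (the current output still leaves the unsigned compare unfolded):

  %t.2 = icmp sgt i8 %N, %iv  ; folds to true in the latch
  %t.1 = icmp ugt i8 %N, %iv  ; could fold to true via the transfer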
---
 .../transfer-signed-facts-to-unsigned.ll      | 198 ++++++++++++++++++
 .../PhaseOrdering/loop-access-checks.ll       |  97 ++++++++-
 2 files changed, 289 insertions(+), 6 deletions(-)

diff --git a/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll b/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
index 2fe92628dfa3b64..41596dbf3e3365e 100644
--- a/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
+++ b/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
@@ -730,3 +730,201 @@ entry:
   %res.2 = xor i1 %res.1, %c.1
   ret i1 %res.2
 }
+
+declare void @use(i1)
+
+define i8 @iv_known_non_negative_constant_trip_count(ptr %dst, i8 %N) {
+; CHECK-LABEL: @iv_known_non_negative_constant_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[IV]], 2
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[T_1:%.*]] = icmp ugt i8 [[N:%.*]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[T_1]])
+; CHECK-NEXT:    [[T_2:%.*]] = icmp sgt i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[T_2]])
+; CHECK-NEXT:    [[F_1:%.*]] = icmp ule i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[F_1]])
+; CHECK-NEXT:    [[F_2:%.*]] = icmp sle i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[F_2]])
+; CHECK-NEXT:    [[C_0:%.*]] = icmp ugt i8 [[IV]], 2
+; CHECK-NEXT:    call void @use(i1 [[C_0]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i8 [[IV]], 1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    ret i8 10
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %cmp = icmp slt i8 %iv, 2
+  br i1 %cmp, label %loop.latch, label %exit.1
+
+loop.latch:
+  %t.1 = icmp ugt i8 %N, %iv
+  call void @use(i1 %t.1)
+  %t.2 = icmp sgt i8 %N, %iv
+  call void @use(i1 %t.2)
+  %f.1 = icmp ule i8 %N, %iv
+  call void @use(i1 %f.1)
+  %f.2 = icmp sle i8 %N, %iv
+  call void @use(i1 %f.2)
+  %c.0 = icmp ugt i8 %iv, 2
+  call void @use(i1 %c.0)
+  %iv.next = add nuw nsw i8 %iv, 1
+  br label %loop.header
+
+exit.1:
+  ret i8 10
+}
+
+define i8 @iv_known_non_negative_constant_trip_count_no_nsw_flag(ptr %dst, i8 %N) {
+; CHECK-LABEL: @iv_known_non_negative_constant_trip_count_no_nsw_flag(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[IV]], 2
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[T_1:%.*]] = icmp ugt i8 [[N:%.*]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[T_1]])
+; CHECK-NEXT:    [[T_2:%.*]] = icmp sgt i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[T_2]])
+; CHECK-NEXT:    [[F_1:%.*]] = icmp ule i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[F_1]])
+; CHECK-NEXT:    [[F_2:%.*]] = icmp sle i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[F_2]])
+; CHECK-NEXT:    [[C_0:%.*]] = icmp ugt i8 [[IV]], 2
+; CHECK-NEXT:    call void @use(i1 [[C_0]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    ret i8 10
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %cmp = icmp slt i8 %iv, 2
+  br i1 %cmp, label %loop.latch, label %exit.1
+
+loop.latch:
+  %t.1 = icmp ugt i8 %N, %iv
+  call void @use(i1 %t.1)
+  %t.2 = icmp sgt i8 %N, %iv
+  call void @use(i1 %t.2)
+  %f.1 = icmp ule i8 %N, %iv
+  call void @use(i1 %f.1)
+  %f.2 = icmp sle i8 %N, %iv
+  call void @use(i1 %f.2)
+  %c.0 = icmp ugt i8 %iv, 2
+  call void @use(i1 %c.0)
+  %iv.next = add nuw i8 %iv, 1
+  br label %loop.header
+
+exit.1:
+  ret i8 10
+}
+
+define i8 @iv_known_non_negative_variable_trip_count(ptr %dst, i8 %N) {
+; CHECK-LABEL: @iv_known_non_negative_variable_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[T_1:%.*]] = icmp ugt i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[T_1]])
+; CHECK-NEXT:    call void @use(i1 true)
+; CHECK-NEXT:    [[F_1:%.*]] = icmp ule i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[F_1]])
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[C_0:%.*]] = icmp ugt i8 [[IV]], 2
+; CHECK-NEXT:    call void @use(i1 [[C_0]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i8 [[IV]], 1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    ret i8 10
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %cmp = icmp slt i8 %iv, %N
+  br i1 %cmp, label %loop.latch, label %exit.1
+
+loop.latch:
+  %t.1 = icmp ugt i8 %N, %iv
+  call void @use(i1 %t.1)
+  %t.2 = icmp sgt i8 %N, %iv
+  call void @use(i1 %t.2)
+  %f.1 = icmp ule i8 %N, %iv
+  call void @use(i1 %f.1)
+  %f.2 = icmp sle i8 %N, %iv
+  call void @use(i1 %f.2)
+  %c.0 = icmp ugt i8 %iv, 2
+  call void @use(i1 %c.0)
+  %iv.next = add nuw nsw i8 %iv, 1
+  br label %loop.header
+
+exit.1:
+  ret i8 10
+}
+
+define i8 @iv_may_signed_wrap_variable_trip_count(ptr %dst, i8 %N) {
+; CHECK-LABEL: @iv_may_signed_wrap_variable_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[T_1:%.*]] = icmp ugt i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[T_1]])
+; CHECK-NEXT:    call void @use(i1 true)
+; CHECK-NEXT:    [[F_1:%.*]] = icmp ule i8 [[N]], [[IV]]
+; CHECK-NEXT:    call void @use(i1 [[F_1]])
+; CHECK-NEXT:    call void @use(i1 false)
+; CHECK-NEXT:    [[C_0:%.*]] = icmp ugt i8 [[IV]], 2
+; CHECK-NEXT:    call void @use(i1 [[C_0]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       exit.1:
+; CHECK-NEXT:    ret i8 10
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %cmp = icmp slt i8 %iv, %N
+  br i1 %cmp, label %loop.latch, label %exit.1
+
+loop.latch:
+  %t.1 = icmp ugt i8 %N, %iv
+  call void @use(i1 %t.1)
+  %t.2 = icmp sgt i8 %N, %iv
+  call void @use(i1 %t.2)
+  %f.1 = icmp ule i8 %N, %iv
+  call void @use(i1 %f.1)
+  %f.2 = icmp sle i8 %N, %iv
+  call void @use(i1 %f.2)
+  %c.0 = icmp ugt i8 %iv, 2
+  call void @use(i1 %c.0)
+  %iv.next = add nuw i8 %iv, 1
+  br label %loop.header
+
+exit.1:
+  ret i8 10
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll b/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll
index fcf1afce80ec081..e1eac5f80485494 100644
--- a/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll
+++ b/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll
@@ -134,8 +134,8 @@ define void @foo(ptr noundef nonnull align 8 dereferenceable(24) noalias %vec) #
 ; CHECK-SAME: (ptr noalias nocapture noundef nonnull readonly align 8 dereferenceable(24) [[VEC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[_M_FINISH_I_I:%.*]] = getelementptr inbounds [[VECTOR_IMPL_DATA:%.*]], ptr [[VEC]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8, !tbaa [[TBAA5:![0-9]+]]
 ; CHECK-NEXT:    [[SUB_PTR_LHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP0]] to i64
 ; CHECK-NEXT:    [[SUB_PTR_RHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; CHECK-NEXT:    [[SUB_PTR_SUB_I_I:%.*]] = sub i64 [[SUB_PTR_LHS_CAST_I_I]], [[SUB_PTR_RHS_CAST_I_I]]
@@ -237,8 +237,8 @@ do.cond:
 do.end:
   %_M_impl = getelementptr inbounds %Vector_base, ptr %this1, i32 0, i32 0
   %_M_start = getelementptr inbounds %Vector_impl_data, ptr %_M_impl, i32 0, i32 0
-  %1 = load ptr, ptr %_M_start, align 8
-  %2 = load i64, ptr %__n.addr, align 8
+  %1 = load ptr, ptr %_M_start, align 8, !tbaa !0
+  %2 = load i64, ptr %__n.addr, align 8, !tbaa !5
   %add.ptr = getelementptr inbounds double, ptr %1, i64 %2
   ret ptr %add.ptr
 }
@@ -250,10 +250,10 @@ entry:
   %this1 = load ptr, ptr %this.addr, align 8
   %_M_impl = getelementptr inbounds %Vector_base, ptr %this1, i32 0, i32 0
   %_M_finish = getelementptr inbounds %Vector_impl_data, ptr %_M_impl, i32 0, i32 1
-  %0 = load ptr, ptr %_M_finish, align 8
+  %0 = load ptr, ptr %_M_finish, align 8, !tbaa !7
   %_M_impl2 = getelementptr inbounds %Vector_base, ptr %this1, i32 0, i32 0
   %_M_start = getelementptr inbounds %Vector_impl_data, ptr %_M_impl2, i32 0, i32 0
-  %1 = load ptr, ptr %_M_start, align 8
+  %1 = load ptr, ptr %_M_start, align 8, !tbaa !0
   %sub.ptr.lhs.cast = ptrtoint ptr %0 to i64
   %sub.ptr.rhs.cast = ptrtoint ptr %1 to i64
   %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
@@ -263,6 +263,79 @@ entry:
 
 declare void @abort()
 
+; -------------------------------------------------------------------------
+; Test case for runtime check removal when accessing vector elements with
+; hardened glibc++ and a signed induction variable.
+; https://github.com/llvm/llvm-project/issues/63126
+
+define void @loop_with_signed_induction(ptr noundef nonnull align 8 dereferenceable(24) %vec) {
+; CHECK-LABEL: define void @loop_with_signed_induction
+; CHECK-SAME: (ptr nocapture noundef nonnull readonly align 8 dereferenceable(24) [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[_M_FINISH_I_I:%.*]] = getelementptr inbounds [[VECTOR_IMPL_DATA:%.*]], ptr [[VEC]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8, !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB_I_I:%.*]] = sub i64 [[SUB_PTR_LHS_CAST_I_I]], [[SUB_PTR_RHS_CAST_I_I]]
+; CHECK-NEXT:    [[SUB_PTR_DIV_I_I:%.*]] = ashr exact i64 [[SUB_PTR_SUB_I_I]], 3
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i64 [[SUB_PTR_DIV_I_I]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label [[OPERATOR_ACC_EXIT:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       operator_acc.exit:
+; CHECK-NEXT:    [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[OPERATOR_ACC_EXIT]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ADD_PTR_I:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 [[I_010]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ADD_PTR_I]], align 8, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP2]], 1.000000e+00
+; CHECK-NEXT:    store double [[ADD]], ptr [[ADD_PTR_I]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_010]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INC]], [[SUB_PTR_DIV_I_I]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[OPERATOR_ACC_EXIT]], label [[FOR_COND_CLEANUP]]
+;
+entry:
+  %vec.addr = alloca ptr, align 8
+  %count = alloca i64, align 8
+  %i = alloca i64, align 8
+  store ptr %vec, ptr %vec.addr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr %count)
+  %0 = load ptr, ptr %vec.addr, align 8
+  %call = call noundef i64 @alloc(ptr noundef nonnull align 8 dereferenceable(24) %0)
+  store i64 %call, ptr %count, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr %i)
+  store i64 0, ptr %i, align 8
+  br label %for.cond
+
+for.cond:
+  %1 = load i64, ptr %i, align 8
+  %2 = load i64, ptr %count, align 8
+  %cmp = icmp slt i64 %1, %2
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  call void @llvm.lifetime.end.p0(i64 8, ptr %i)
+  br label %for.end
+
+for.body:
+  %3 = load ptr, ptr %vec.addr, align 8
+  %4 = load i64, ptr %i, align 8
+  %call1 = call noundef nonnull align 8 dereferenceable(8) ptr @operator_acc(ptr noundef nonnull align 8 dereferenceable(24) %3, i64 noundef %4)
+  %5 = load double, ptr %call1, align 8, !tbaa !8
+  %add = fadd double %5, 1.000000e+00
+  store double %add, ptr %call1, align 8, !tbaa !8
+  br label %for.inc
+
+for.inc:
+  %6 = load i64, ptr %i, align 8
+  %inc = add nsw i64 %6, 1
+  store i64 %inc, ptr %i, align 8
+  br label %for.cond
+
+for.end:
+  call void @llvm.lifetime.end.p0(i64 8, ptr %count)
+  ret void
+}
+
 ; -------------------------------------------------------------------------
 ; Test case for runtime check removal when accessing elements in a nested loop
 ; (PR64881)
@@ -386,3 +459,15 @@ if.end:                                           ; preds = %entry
   %add.ptr = getelementptr inbounds i32, ptr %2, i64 %idx.ext
   ret ptr %add.ptr
 }
+
+
+!0 = !{!1, !2, i64 0}
+!1 = !{!"_ZTSNSt12_Vector_baseIdSaIdEE17_Vector_impl_dataE", !2, i64 0, !2, i64 8, !2, i64 16}
+!2 = !{!"any pointer", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"long", !3, i64 0}
+!7 = !{!1, !2, i64 8}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"double", !3, i64 0}

>From fe300f35ca1a4ae06f3f76f3d8732f75e76a3c16 Mon Sep 17 00:00:00 2001
From: Konstantin Varlamov <varconsteq at gmail.com>
Date: Fri, 3 Nov 2023 06:46:19 -0700
Subject: [PATCH 25/76] [libc++][hardening] Add tests for the hardened mode
 with ABI breaks. (#71020)

Add a new test mode that enables the hardened mode in combination with
ABI-breaking changes (only bounded iterators currently) and reenable the
bounded iterator tests for `span` and `string_view`.
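
The new configuration can be exercised locally the same way the CI job
invokes it:

  libcxx/utils/ci/run-buildbot generic-hardened-mode-with-abi-breaks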
---
 ...Generic-hardened-mode-with-abi-breaks.cmake |  2 ++
 .../debug.iterator-indexing.pass.cpp           |  4 ++--
 .../debug.iterator-indexing.pass.cpp           |  4 ++--
 libcxx/utils/ci/buildkite-pipeline.yml         | 18 ++++++++++++++++++
 libcxx/utils/ci/run-buildbot                   |  6 ++++++
 libcxx/utils/libcxx/test/features.py           |  1 +
 6 files changed, 31 insertions(+), 4 deletions(-)
 create mode 100644 libcxx/cmake/caches/Generic-hardened-mode-with-abi-breaks.cmake

diff --git a/libcxx/cmake/caches/Generic-hardened-mode-with-abi-breaks.cmake b/libcxx/cmake/caches/Generic-hardened-mode-with-abi-breaks.cmake
new file mode 100644
index 000000000000000..de5d3820fba4cc6
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-hardened-mode-with-abi-breaks.cmake
@@ -0,0 +1,2 @@
+set(LIBCXX_HARDENING_MODE "hardened" CACHE STRING "")
+set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS" CACHE STRING "")
diff --git a/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp
index c2b5598c77cef56..2449a5f250398c8 100644
--- a/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp
+++ b/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp
@@ -9,8 +9,8 @@
 
 // Make sure that std::span's iterators check for OOB accesses when the debug mode is enabled.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
+// UNSUPPORTED: libcpp-hardening-mode=unchecked
 
 #include <span>
 
diff --git a/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp b/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp
index 797b8bad7c4d4a7..3dc998a458a12ea 100644
--- a/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp
+++ b/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp
@@ -8,8 +8,8 @@
 
 // Make sure that std::string_view's iterators check for OOB accesses when the debug mode is enabled.
 
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
+// UNSUPPORTED: libcpp-hardening-mode=unchecked
 
 #include <string_view>
 
diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml
index 5a5bc39c5797bdd..41b3d27d48f7fdd 100644
--- a/libcxx/utils/ci/buildkite-pipeline.yml
+++ b/libcxx/utils/ci/buildkite-pipeline.yml
@@ -422,6 +422,24 @@ steps:
           limit: 2
     timeout_in_minutes: 120
 
+  - label: "Hardened mode with ABI breaks"
+    command: "libcxx/utils/ci/run-buildbot generic-hardened-mode-with-abi-breaks"
+    artifact_paths:
+      - "**/test-results.xml"
+      - "**/*.abilist"
+    env:
+        CC: "clang-${LLVM_HEAD_VERSION}"
+        CXX: "clang++-${LLVM_HEAD_VERSION}"
+        ENABLE_CLANG_TIDY: "On"
+    agents:
+      queue: "libcxx-builders"
+      os: "linux"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 2
+    timeout_in_minutes: 120
+
   - label: "Safe mode"
     command: "libcxx/utils/ci/run-buildbot generic-safe-mode"
     artifact_paths:
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index ebb255243ba9c15..c4834330d7e31b8 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -401,6 +401,12 @@ generic-hardened-mode)
     check-runtimes
     check-abi-list
 ;;
+generic-hardened-mode-with-abi-breaks)
+    clean
+    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-hardened-mode-with-abi-breaks.cmake"
+    check-runtimes
+    check-abi-list
+;;
 generic-safe-mode)
     clean
     generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-safe-mode.cmake"
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index ae1c381a167eb5e..29822f55521360b 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -296,6 +296,7 @@ def _getAndroidDeviceApi(cfg):
     "_LIBCPP_HAS_THREAD_API_PTHREAD": "libcpp-has-thread-api-pthread",
     "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime",
     "_LIBCPP_ABI_VERSION": "libcpp-abi-version",
+    "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators",
     "_LIBCPP_HAS_NO_FILESYSTEM": "no-filesystem",
     "_LIBCPP_HAS_NO_RANDOM_DEVICE": "no-random-device",
     "_LIBCPP_HAS_NO_LOCALIZATION": "no-localization",

>From 1061c0150b587c721aaddd8250d562976dbcd7d6 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Fri, 3 Nov 2023 09:48:38 -0400
Subject: [PATCH 26/76] [libc++] Remove legacy feature
 suse-linux-enterprise-server-11 (#71103)

This Lit feature is not defined anywhere anymore and that platform is
not on our radar for support, so this is basically dead code.
---
 .../syserr.errcat.objects/generic_category.pass.cpp              | 1 -
 .../syserr.errcat/syserr.errcat.objects/system_category.pass.cpp | 1 -
 2 files changed, 2 deletions(-)

diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
index 90e5bd39e5b016d..5b63272a3a1a6a9 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: suse-linux-enterprise-server-11
 // XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11|12}}
 
 // <system_error>
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
index 1de1b59e5c6c3d6..1caf5cb2ccf73d5 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
@@ -12,7 +12,6 @@
 
 // const error_category& system_category();
 
-// XFAIL: suse-linux-enterprise-server-11
 // XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11|12}}
 
 #include <system_error>

>From 6e4692c9ee8718eed19288f0e5b2d9d6655cb56d Mon Sep 17 00:00:00 2001
From: Jessica Del <50999226+OutOfCache at users.noreply.github.com>
Date: Fri, 3 Nov 2023 14:48:59 +0100
Subject: [PATCH 27/76] [AMDGPU] - Add s_wqm intrinsics (#71048)

Add intrinsics to generate `s_wqm_b32` and `s_wqm_b64`.

Support VGPR arguments by inserting a `v_readfirstlane`.
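
For context, `s_wqm_b{32,64}` computes a whole-quad-mode mask: each
group of four bits in the result is all ones if any bit in the
corresponding group of the source is set (values illustrative):

  s_wqm_b32 s0, s1      ; s1 = 0x00010001  =>  s0 = 0x000f000f

From IR, the new intrinsic is simply:

  %wqm = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask)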
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  6 ++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  6 +-
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  6 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll   | 87 +++++++++++++++++++
 5 files changed, 104 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 33fa3985e64d8b1..1254499648fefff 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1944,6 +1944,12 @@ def int_amdgcn_s_bitreplicate :
 def int_amdgcn_s_quadmask :
   DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
 
+// Lowers to S_WQM_B{32,64}
+// The argument must be uniform; otherwise, the result is undefined.
+// Does not set WQM; merely calculates the bitmask.
+def int_amdgcn_s_wqm :
+  DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
+
 class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
     [data_ty],
     [
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 047108fd06db5f9..31637cc70ef3f1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2996,6 +2996,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     case Intrinsic::amdgcn_inverse_ballot:
     case Intrinsic::amdgcn_s_bitreplicate:
     case Intrinsic::amdgcn_s_quadmask:
+    case Intrinsic::amdgcn_s_wqm:
       applyDefaultMapping(OpdMapper);
       constrainOpWithReadfirstlane(B, MI, 2); // Mask
       return;
@@ -4541,7 +4542,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
       break;
     }
-    case Intrinsic::amdgcn_s_quadmask: {
+    case Intrinsic::amdgcn_s_quadmask:
+    case Intrinsic::amdgcn_s_wqm: {
       Register MaskReg = MI.getOperand(2).getReg();
       unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
       unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f61735a59c97078..d73328d67f6078e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6484,10 +6484,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
     return CreatedBB;
   }
 
-  // Legalize S_BITREPLICATE and S_QUADMASK
+  // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
   if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
       MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
-      MI.getOpcode() == AMDGPU::S_QUADMASK_B64) {
+      MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
+      MI.getOpcode() == AMDGPU::S_WQM_B32 ||
+      MI.getOpcode() == AMDGPU::S_WQM_B64) {
     MachineOperand &Src = MI.getOperand(1);
     if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index a28921a7ff33f6d..0a16a07cb5ec35b 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -216,8 +216,10 @@ let Defs = [SCC] in {
   def S_NOT_B64 : SOP1_64 <"s_not_b64",
     [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))]
   >;
-  def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
-  def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
+  def S_WQM_B32 : SOP1_32 <"s_wqm_b32",
+    [(set i32:$sdst, (int_amdgcn_s_wqm i32:$src0))]>;
+  def S_WQM_B64 : SOP1_64 <"s_wqm_b64",
+    [(set i64:$sdst, (int_amdgcn_s_wqm i64:$src0))]>;
 } // End Defs = [SCC]
 
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
new file mode 100644
index 000000000000000..6676dac19ba797f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX11 %s
+
+declare i32 @llvm.amdgcn.s.wqm.i32(i32)
+declare i64 @llvm.amdgcn.s.wqm.i64(i64)
+
+define i32 @test_s_wqm_constant_i32() {
+; GFX11-LABEL: test_s_wqm_constant_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_wqm_b32 s0, 0x85fe3a92
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 u0x85FE3A92)
+  ret i32 %br
+}
+
+define amdgpu_cs void @test_s_wqm_sgpr_i32(i32 inreg %mask, ptr addrspace(1) %out) {
+; GFX11-LABEL: test_s_wqm_sgpr_i32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_wqm_b32 s0, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask)
+  store i32 %br, ptr addrspace(1) %out
+  ret void
+}
+
+define i32 @test_s_wqm_vgpr_i32(i32 %mask) {
+; GFX11-LABEL: test_s_wqm_vgpr_i32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_wqm_b32 s0, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask)
+  ret i32 %br
+}
+
+define i64 @test_s_wqm_constant_i64() {
+; GFX11-LABEL: test_s_wqm_constant_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, 0x85fe3a92
+; GFX11-NEXT:    s_mov_b32 s1, 0x3a9285fe
+; GFX11-NEXT:    s_wqm_b64 s[0:1], s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 u0x3A9285FE85FE3A92)
+  ret i64 %br
+}
+
+define amdgpu_cs void @test_s_wqm_sgpr_i64(i64 inreg %mask, ptr addrspace(1) %out) {
+; GFX11-LABEL: test_s_wqm_sgpr_i64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_wqm_b64 s[0:1], s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask)
+  store i64 %br, ptr addrspace(1) %out
+  ret void
+}
+
+define i64 @test_s_wqm_vgpr_i64(i64 %mask) {
+; GFX11-LABEL: test_s_wqm_vgpr_i64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11-NEXT:    s_wqm_b64 s[0:1], s[0:1]
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask)
+  ret i64 %br
+}

>From 03110ddeb2c21d1b500e7f6f6e70134e269738bc Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 14:47:27 +0100
Subject: [PATCH 28/76] [IR] Remove ZExtOperator (NFC)

Now that zext constant expressions are no longer supported,
ZExtInst should be used instead.
---
 llvm/include/llvm/IR/Operator.h                          | 2 --
 llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp  | 8 ++++----
 llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 2 +-
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h
index 2246bb06549c7aa..12733529a74dc7d 100644
--- a/llvm/include/llvm/IR/Operator.h
+++ b/llvm/include/llvm/IR/Operator.h
@@ -367,8 +367,6 @@ class LShrOperator
   : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {
 };
 
-class ZExtOperator : public ConcreteOperator<Operator, Instruction::ZExt> {};
-
 class GEPOperator
   : public ConcreteOperator<Operator, Instruction::GetElementPtr> {
   friend class GetElementPtrInst;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 7574987d0e23141..f06657c8cd7633d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5574,8 +5574,8 @@ Instruction *InstCombinerImpl::foldICmpWithZextOrSext(ICmpInst &ICmp) {
   // icmp Pred (ext X), (ext Y)
   Value *Y;
   if (match(ICmp.getOperand(1), m_ZExtOrSExt(m_Value(Y)))) {
-    bool IsZext0 = isa<ZExtOperator>(ICmp.getOperand(0));
-    bool IsZext1 = isa<ZExtOperator>(ICmp.getOperand(1));
+    bool IsZext0 = isa<ZExtInst>(ICmp.getOperand(0));
+    bool IsZext1 = isa<ZExtInst>(ICmp.getOperand(1));
 
     if (IsZext0 != IsZext1) {
         // If X and Y and both i1
@@ -5834,8 +5834,8 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
     return nullptr;
   assert(MulInstr->getOpcode() == Instruction::Mul);
 
-  auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
-       *RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
+  auto *LHS = cast<ZExtInst>(MulInstr->getOperand(0)),
+       *RHS = cast<ZExtInst>(MulInstr->getOperand(1));
   assert(LHS->getOpcode() == Instruction::ZExt);
   assert(RHS->getOpcode() == Instruction::ZExt);
   Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 559eb2ef4795eb1..1474cc9c3fdb54e 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -916,7 +916,7 @@ InstCombinerImpl::foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I) {
 
   auto NewFoldedConst = [&](bool IsTrueArm, Value *V) {
     bool IsCastOpRHS = (CastOp == RHS);
-    bool IsZExt = isa<ZExtOperator>(CastOp);
+    bool IsZExt = isa<ZExtInst>(CastOp);
     Constant *C;
 
     if (IsTrueArm) {

>From 48be81e172911eb8cdae8a1ffd0166edfb2cfc04 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd at linux.alibaba.com>
Date: Fri, 3 Nov 2023 21:59:44 +0800
Subject: [PATCH 29/76] [NFC] [Serializer] Pack information in serializer
 (#69287)

Previously, each boolean value occupied a slot that could hold a full
integer. This wastes space, especially when several boolean values are
serialized consecutively. This patch packs such consecutive boolean
values (and enum values) into single integers so that we save both
space and time.

Before the patch, building the std module
(https://libcxx.llvm.org/Modules.html) took 4.478s on my machine and
produced a BMI of 28712 bytes. After the patch, the build takes 4.374s
and the BMI is 27388 bytes.

This is intended to be an NFC patch.

This patch doesn't optimize all such cases; we can do that later once
there is consensus on the approach.
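
A minimal usage sketch, mirroring how the reader and writer changes
below use the new helpers (writer and reader live in different
functions; only the bit order has to match):

  // Writer side: pack consecutive flags into one 32-bit record value.
  BitsPacker DeclBits;
  DeclBits.addBit(D->isInvalidDecl());
  DeclBits.addBit(D->hasAttrs());
  DeclBits.addBits((uint32_t)D->getAccess(), /*BitsWidth=*/2);
  Record.push_back(DeclBits); // implicit uint32_t conversion

  // Reader side: consume the bits in exactly the same order.
  BitsUnpacker DeclBits(Record.readInt());
  D->InvalidDecl = DeclBits.getNextBit();
  bool HasAttrs = DeclBits.getNextBit();
  D->setAccess((AccessSpecifier)DeclBits.getNextBits(/*Width=*/2));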
---
 clang/include/clang/AST/DeclBase.h            |   2 +-
 clang/include/clang/Serialization/ASTReader.h |  47 ++
 clang/include/clang/Serialization/ASTWriter.h |  53 +++
 clang/lib/Serialization/ASTReaderDecl.cpp     | 230 +++++----
 clang/lib/Serialization/ASTWriter.cpp         |  32 +-
 clang/lib/Serialization/ASTWriterDecl.cpp     | 448 ++++++++----------
 clang/test/Modules/decl-params-determinisim.m |  16 +-
 7 files changed, 471 insertions(+), 357 deletions(-)

diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index f784fa73af5bad5..fdc59ac7419d9e3 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -211,7 +211,7 @@ class alignas(8) Decl {
   /// The kind of ownership a declaration has, for visibility purposes.
   /// This enumeration is designed such that higher values represent higher
   /// levels of name hiding.
-  enum class ModuleOwnershipKind : unsigned {
+  enum class ModuleOwnershipKind : unsigned char {
     /// This declaration is not owned by a module.
     Unowned,
 
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 531ad94f0906ac0..bafbe779d60acff 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -2407,6 +2407,53 @@ class ASTReader
   bool isProcessingUpdateRecords() { return ProcessingUpdateRecords; }
 };
 
+/// A simple helper class to unpack an integer to bits and consuming
+/// the bits in order.
+class BitsUnpacker {
+  constexpr static uint32_t BitsIndexUpbound = 32;
+
+public:
+  BitsUnpacker(uint32_t V) { updateValue(V); }
+  BitsUnpacker(const BitsUnpacker &) = delete;
+  BitsUnpacker(BitsUnpacker &&) = delete;
+  BitsUnpacker operator=(const BitsUnpacker &) = delete;
+  BitsUnpacker operator=(BitsUnpacker &&) = delete;
+  ~BitsUnpacker() {
+#ifndef NDEBUG
+    while (isValid())
+      assert(!getNextBit() && "There are unprocessed bits!");
+#endif
+  }
+
+  void updateValue(uint32_t V) {
+    Value = V;
+    CurrentBitsIndex = 0;
+  }
+
+  bool getNextBit() {
+    assert(isValid());
+    return Value & (1 << CurrentBitsIndex++);
+  }
+
+  uint32_t getNextBits(uint32_t Width) {
+    assert(isValid());
+    assert(Width < BitsIndexUpbound);
+    uint32_t Ret = (Value >> CurrentBitsIndex) & ((1 << Width) - 1);
+    CurrentBitsIndex += Width;
+    return Ret;
+  }
+
+  bool canGetNextNBits(uint32_t Width) const {
+    return CurrentBitsIndex + Width < BitsIndexUpbound;
+  }
+
+private:
+  bool isValid() const { return CurrentBitsIndex < BitsIndexUpbound; }
+
+  uint32_t Value;
+  uint32_t CurrentBitsIndex = ~0;
+};
+
 } // namespace clang
 
 #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index 98445d40ebd82c3..3019bbc2ddc9cc7 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -830,6 +830,59 @@ class PCHGenerator : public SemaConsumer {
   bool hasEmittedPCH() const { return Buffer->IsComplete; }
 };
 
+/// A simple helper class to pack several bits in order into (a) 32 bit
+/// integer(s).
+class BitsPacker {
+  constexpr static uint32_t BitIndexUpbound = 32u;
+
+public:
+  BitsPacker() = default;
+  BitsPacker(const BitsPacker &) = delete;
+  BitsPacker(BitsPacker &&) = delete;
+  BitsPacker operator=(const BitsPacker &) = delete;
+  BitsPacker operator=(BitsPacker &&) = delete;
+  ~BitsPacker() {
+    assert(!hasUnconsumedValues() && "There are unprocessed bits!");
+  }
+
+  void addBit(bool Value) { addBits(Value, 1); }
+  void addBits(uint32_t Value, uint32_t BitsWidth) {
+    assert(BitsWidth < BitIndexUpbound);
+    assert((Value < (1u << BitsWidth)) && "Passing narrower bit width!");
+
+    if (CurrentBitIndex + BitsWidth >= BitIndexUpbound) {
+      Values.push_back(0);
+      CurrentBitIndex = 0;
+    }
+
+    assert(CurrentBitIndex < BitIndexUpbound);
+    Values.back() |= Value << CurrentBitIndex;
+    CurrentBitIndex += BitsWidth;
+  }
+
+  bool hasUnconsumedValues() const {
+    return ConsumingValueIndex < Values.size();
+  }
+  uint32_t getNextValue() {
+    assert(hasUnconsumedValues());
+    return Values[ConsumingValueIndex++];
+  }
+
+  // We can convert the packer to a uint32_t if there is only one value.
+  operator uint32_t() {
+    assert(Values.size() == 1);
+    return getNextValue();
+  }
+
+private:
+  SmallVector<uint64_t, 4> Values;
+  uint16_t ConsumingValueIndex = 0;
+  // Initialize CurrentBitIndex with an invalid value
+  // to make it easier to update Values. See the implementation
+  // of `addBits` to see the details.
+  uint16_t CurrentBitIndex = BitIndexUpbound;
+};
+
 } // namespace clang
 
 #endif // LLVM_CLANG_SERIALIZATION_ASTWRITER_H
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 00f57f88781e632..a63911cb4adfba5 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -617,24 +617,29 @@ void ASTDeclReader::VisitDecl(Decl *D) {
                            Reader.getContext());
   }
   D->setLocation(ThisDeclLoc);
-  D->InvalidDecl = Record.readInt();
-  if (Record.readInt()) { // hasAttrs
+
+  BitsUnpacker DeclBits(Record.readInt());
+  D->InvalidDecl = DeclBits.getNextBit();
+  bool HasAttrs = DeclBits.getNextBit();
+  D->setImplicit(DeclBits.getNextBit());
+  D->Used = DeclBits.getNextBit();
+  IsDeclMarkedUsed |= D->Used;
+  D->setReferenced(DeclBits.getNextBit());
+  D->setTopLevelDeclInObjCContainer(DeclBits.getNextBit());
+  D->setAccess((AccessSpecifier)DeclBits.getNextBits(/*Width=*/2));
+  D->FromASTFile = true;
+  auto ModuleOwnership =
+      (Decl::ModuleOwnershipKind)DeclBits.getNextBits(/*Width=*/3);
+  bool ModulePrivate =
+      (ModuleOwnership == Decl::ModuleOwnershipKind::ModulePrivate);
+
+  if (HasAttrs) {
     AttrVec Attrs;
     Record.readAttributes(Attrs);
     // Avoid calling setAttrs() directly because it uses Decl::getASTContext()
     // internally which is unsafe during derialization.
     D->setAttrsImpl(Attrs, Reader.getContext());
   }
-  D->setImplicit(Record.readInt());
-  D->Used = Record.readInt();
-  IsDeclMarkedUsed |= D->Used;
-  D->setReferenced(Record.readInt());
-  D->setTopLevelDeclInObjCContainer(Record.readInt());
-  D->setAccess((AccessSpecifier)Record.readInt());
-  D->FromASTFile = true;
-  auto ModuleOwnership = (Decl::ModuleOwnershipKind)Record.readInt();
-  bool ModulePrivate =
-      (ModuleOwnership == Decl::ModuleOwnershipKind::ModulePrivate);
 
   // Determine whether this declaration is part of a (sub)module. If so, it
   // may not yet be visible.
@@ -750,12 +755,13 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitTagDecl(TagDecl *TD) {
   VisitTypeDecl(TD);
 
   TD->IdentifierNamespace = Record.readInt();
-  TD->setTagKind((TagDecl::TagKind)Record.readInt());
-  if (!isa<CXXRecordDecl>(TD))
-    TD->setCompleteDefinition(Record.readInt());
-  TD->setEmbeddedInDeclarator(Record.readInt());
-  TD->setFreeStanding(Record.readInt());
-  TD->setCompleteDefinitionRequired(Record.readInt());
+
+  BitsUnpacker TagDeclBits(Record.readInt());
+  TD->setTagKind((TagDecl::TagKind)TagDeclBits.getNextBits(/*Width=*/3));
+  TD->setCompleteDefinition(TagDeclBits.getNextBit());
+  TD->setEmbeddedInDeclarator(TagDeclBits.getNextBit());
+  TD->setFreeStanding(TagDeclBits.getNextBit());
+  TD->setCompleteDefinitionRequired(TagDeclBits.getNextBit());
   TD->setBraceRange(readSourceRange());
 
   switch (Record.readInt()) {
@@ -787,11 +793,13 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) {
   else
     ED->setIntegerType(Record.readType());
   ED->setPromotionType(Record.readType());
-  ED->setNumPositiveBits(Record.readInt());
-  ED->setNumNegativeBits(Record.readInt());
-  ED->setScoped(Record.readInt());
-  ED->setScopedUsingClassTag(Record.readInt());
-  ED->setFixed(Record.readInt());
+
+  BitsUnpacker EnumDeclBits(Record.readInt());
+  ED->setNumPositiveBits(EnumDeclBits.getNextBits(/*Width=*/8));
+  ED->setNumNegativeBits(EnumDeclBits.getNextBits(/*Width=*/8));
+  ED->setScoped(EnumDeclBits.getNextBit());
+  ED->setScopedUsingClassTag(EnumDeclBits.getNextBit());
+  ED->setFixed(EnumDeclBits.getNextBit());
 
   ED->setHasODRHash(true);
   ED->ODRHash = Record.readInt();
@@ -834,18 +842,22 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) {
 ASTDeclReader::RedeclarableResult
 ASTDeclReader::VisitRecordDeclImpl(RecordDecl *RD) {
   RedeclarableResult Redecl = VisitTagDecl(RD);
-  RD->setHasFlexibleArrayMember(Record.readInt());
-  RD->setAnonymousStructOrUnion(Record.readInt());
-  RD->setHasObjectMember(Record.readInt());
-  RD->setHasVolatileMember(Record.readInt());
-  RD->setNonTrivialToPrimitiveDefaultInitialize(Record.readInt());
-  RD->setNonTrivialToPrimitiveCopy(Record.readInt());
-  RD->setNonTrivialToPrimitiveDestroy(Record.readInt());
-  RD->setHasNonTrivialToPrimitiveDefaultInitializeCUnion(Record.readInt());
-  RD->setHasNonTrivialToPrimitiveDestructCUnion(Record.readInt());
-  RD->setHasNonTrivialToPrimitiveCopyCUnion(Record.readInt());
-  RD->setParamDestroyedInCallee(Record.readInt());
-  RD->setArgPassingRestrictions((RecordArgPassingKind)Record.readInt());
+
+  BitsUnpacker RecordDeclBits(Record.readInt());
+  RD->setHasFlexibleArrayMember(RecordDeclBits.getNextBit());
+  RD->setAnonymousStructOrUnion(RecordDeclBits.getNextBit());
+  RD->setHasObjectMember(RecordDeclBits.getNextBit());
+  RD->setHasVolatileMember(RecordDeclBits.getNextBit());
+  RD->setNonTrivialToPrimitiveDefaultInitialize(RecordDeclBits.getNextBit());
+  RD->setNonTrivialToPrimitiveCopy(RecordDeclBits.getNextBit());
+  RD->setNonTrivialToPrimitiveDestroy(RecordDeclBits.getNextBit());
+  RD->setHasNonTrivialToPrimitiveDefaultInitializeCUnion(
+      RecordDeclBits.getNextBit());
+  RD->setHasNonTrivialToPrimitiveDestructCUnion(RecordDeclBits.getNextBit());
+  RD->setHasNonTrivialToPrimitiveCopyCUnion(RecordDeclBits.getNextBit());
+  RD->setParamDestroyedInCallee(RecordDeclBits.getNextBit());
+  RD->setArgPassingRestrictions(
+      (RecordArgPassingKind)RecordDeclBits.getNextBits(/*Width=*/2));
   return Redecl;
 }
 
@@ -1046,32 +1058,35 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) {
 
   // FunctionDecl's body is handled last at ASTDeclReader::Visit,
   // after everything else is read.
+  BitsUnpacker FunctionDeclBits(Record.readInt());
 
-  FD->setStorageClass(static_cast<StorageClass>(Record.readInt()));
-  FD->setInlineSpecified(Record.readInt());
-  FD->setImplicitlyInline(Record.readInt());
-  FD->setVirtualAsWritten(Record.readInt());
+  FD->setStorageClass((StorageClass)FunctionDeclBits.getNextBits(/*Width=*/3));
+  FD->setInlineSpecified(FunctionDeclBits.getNextBit());
+  FD->setImplicitlyInline(FunctionDeclBits.getNextBit());
+  FD->setVirtualAsWritten(FunctionDeclBits.getNextBit());
   // We defer calling `FunctionDecl::setPure()` here as for methods of
   // `CXXTemplateSpecializationDecl`s, we may not have connected up the
   // definition (which is required for `setPure`).
-  const bool Pure = Record.readInt();
-  FD->setHasInheritedPrototype(Record.readInt());
-  FD->setHasWrittenPrototype(Record.readInt());
-  FD->setDeletedAsWritten(Record.readInt());
-  FD->setTrivial(Record.readInt());
-  FD->setTrivialForCall(Record.readInt());
-  FD->setDefaulted(Record.readInt());
-  FD->setExplicitlyDefaulted(Record.readInt());
-  FD->setIneligibleOrNotSelected(Record.readInt());
-  FD->setHasImplicitReturnZero(Record.readInt());
-  FD->setConstexprKind(static_cast<ConstexprSpecKind>(Record.readInt()));
-  FD->setUsesSEHTry(Record.readInt());
-  FD->setHasSkippedBody(Record.readInt());
-  FD->setIsMultiVersion(Record.readInt());
-  FD->setLateTemplateParsed(Record.readInt());
-  FD->setFriendConstraintRefersToEnclosingTemplate(Record.readInt());
-
-  FD->setCachedLinkage(static_cast<Linkage>(Record.readInt()));
+  const bool Pure = FunctionDeclBits.getNextBit();
+  FD->setHasInheritedPrototype(FunctionDeclBits.getNextBit());
+  FD->setHasWrittenPrototype(FunctionDeclBits.getNextBit());
+  FD->setDeletedAsWritten(FunctionDeclBits.getNextBit());
+  FD->setTrivial(FunctionDeclBits.getNextBit());
+  FD->setTrivialForCall(FunctionDeclBits.getNextBit());
+  FD->setDefaulted(FunctionDeclBits.getNextBit());
+  FD->setExplicitlyDefaulted(FunctionDeclBits.getNextBit());
+  FD->setIneligibleOrNotSelected(FunctionDeclBits.getNextBit());
+  FD->setHasImplicitReturnZero(FunctionDeclBits.getNextBit());
+  FD->setConstexprKind(
+      (ConstexprSpecKind)FunctionDeclBits.getNextBits(/*Width=*/2));
+  FD->setUsesSEHTry(FunctionDeclBits.getNextBit());
+  FD->setHasSkippedBody(FunctionDeclBits.getNextBit());
+  FD->setIsMultiVersion(FunctionDeclBits.getNextBit());
+  FD->setLateTemplateParsed(FunctionDeclBits.getNextBit());
+  FD->setFriendConstraintRefersToEnclosingTemplate(
+      FunctionDeclBits.getNextBit());
+  FD->setCachedLinkage((Linkage)FunctionDeclBits.getNextBits(/*Width=*/3));
+
   FD->EndRangeLoc = readSourceLocation();
   FD->setDefaultLoc(readSourceLocation());
 
@@ -1575,26 +1590,29 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
   RedeclarableResult Redecl = VisitRedeclarable(VD);
   VisitDeclaratorDecl(VD);
 
-  VD->VarDeclBits.SClass = (StorageClass)Record.readInt();
-  VD->VarDeclBits.TSCSpec = Record.readInt();
-  VD->VarDeclBits.InitStyle = Record.readInt();
-  VD->VarDeclBits.ARCPseudoStrong = Record.readInt();
+  BitsUnpacker VarDeclBits(Record.readInt());
+  VD->VarDeclBits.SClass = (StorageClass)VarDeclBits.getNextBits(/*Width=*/3);
+  VD->VarDeclBits.TSCSpec = VarDeclBits.getNextBits(/*Width=*/2);
+  VD->VarDeclBits.InitStyle = VarDeclBits.getNextBits(/*Width=*/2);
+  VD->VarDeclBits.ARCPseudoStrong = VarDeclBits.getNextBit();
   bool HasDeducedType = false;
   if (!isa<ParmVarDecl>(VD)) {
     VD->NonParmVarDeclBits.IsThisDeclarationADemotedDefinition =
-        Record.readInt();
-    VD->NonParmVarDeclBits.ExceptionVar = Record.readInt();
-    VD->NonParmVarDeclBits.NRVOVariable = Record.readInt();
-    VD->NonParmVarDeclBits.CXXForRangeDecl = Record.readInt();
-    VD->NonParmVarDeclBits.ObjCForDecl = Record.readInt();
-    VD->NonParmVarDeclBits.IsInline = Record.readInt();
-    VD->NonParmVarDeclBits.IsInlineSpecified = Record.readInt();
-    VD->NonParmVarDeclBits.IsConstexpr = Record.readInt();
-    VD->NonParmVarDeclBits.IsInitCapture = Record.readInt();
-    VD->NonParmVarDeclBits.PreviousDeclInSameBlockScope = Record.readInt();
-    VD->NonParmVarDeclBits.ImplicitParamKind = Record.readInt();
-    VD->NonParmVarDeclBits.EscapingByref = Record.readInt();
-    HasDeducedType = Record.readInt();
+        VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.ExceptionVar = VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.NRVOVariable = VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.CXXForRangeDecl = VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.ObjCForDecl = VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.IsInline = VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.IsInlineSpecified = VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.IsConstexpr = VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.IsInitCapture = VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.PreviousDeclInSameBlockScope =
+        VarDeclBits.getNextBit();
+    VD->NonParmVarDeclBits.ImplicitParamKind =
+        VarDeclBits.getNextBits(/*Width=*/3);
+    VD->NonParmVarDeclBits.EscapingByref = VarDeclBits.getNextBit();
+    HasDeducedType = VarDeclBits.getNextBit();
   }
 
   // If this variable has a deduced type, defer reading that type until we are
@@ -1606,7 +1624,7 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
     VD->setType(Reader.GetType(DeferredTypeID));
   DeferredTypeID = 0;
 
-  auto VarLinkage = static_cast<Linkage>(Record.readInt());
+  auto VarLinkage = Linkage(VarDeclBits.getNextBits(/*Width=*/3));
   VD->setCachedLinkage(VarLinkage);
 
   // Reconstruct the one piece of the IdentifierNamespace that we need.
@@ -1614,18 +1632,18 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
       VD->getLexicalDeclContext()->isFunctionOrMethod())
     VD->setLocalExternDecl();
 
+  if (VarDeclBits.getNextBit()) {
+    Reader.DefinitionSource[VD] =
+        Loc.F->Kind == ModuleKind::MK_MainFile ||
+        Reader.getContext().getLangOpts().BuildingPCHWithObjectFile;
+  }
+
   if (VD->hasAttr<BlocksAttr>()) {
     Expr *CopyExpr = Record.readExpr();
     if (CopyExpr)
       Reader.getContext().setBlockVarCopyInit(VD, CopyExpr, Record.readInt());
   }
 
-  if (Record.readInt()) {
-    Reader.DefinitionSource[VD] =
-        Loc.F->Kind == ModuleKind::MK_MainFile ||
-        Reader.getContext().getLangOpts().BuildingPCHWithObjectFile;
-  }
-
   enum VarKind {
     VarNotTemplate = 0, VarTemplate, StaticDataMemberSpecialization
   };
@@ -1679,9 +1697,11 @@ void ASTDeclReader::VisitImplicitParamDecl(ImplicitParamDecl *PD) {
 
 void ASTDeclReader::VisitParmVarDecl(ParmVarDecl *PD) {
   VisitVarDecl(PD);
-  unsigned isObjCMethodParam = Record.readInt();
-  unsigned scopeDepth = Record.readInt();
-  unsigned scopeIndex = Record.readInt();
+
+  BitsUnpacker ParmVarDeclBits(Record.readInt());
+  unsigned isObjCMethodParam = ParmVarDeclBits.getNextBit();
+  unsigned scopeDepth = ParmVarDeclBits.getNextBits(/*Width=*/7);
+  unsigned scopeIndex = ParmVarDeclBits.getNextBits(/*Width=*/8);
   unsigned declQualifier = Record.readInt();
   if (isObjCMethodParam) {
     assert(scopeDepth == 0);
@@ -1690,9 +1710,10 @@ void ASTDeclReader::VisitParmVarDecl(ParmVarDecl *PD) {
   } else {
     PD->setScopeInfo(scopeDepth, scopeIndex);
   }
-  PD->ParmVarDeclBits.IsKNRPromoted = Record.readInt();
-  PD->ParmVarDeclBits.HasInheritedDefaultArg = Record.readInt();
-  if (Record.readInt()) // hasUninstantiatedDefaultArg.
+  PD->ParmVarDeclBits.IsKNRPromoted = ParmVarDeclBits.getNextBit();
+
+  PD->ParmVarDeclBits.HasInheritedDefaultArg = ParmVarDeclBits.getNextBit();
+  if (ParmVarDeclBits.getNextBit()) // hasUninstantiatedDefaultArg.
     PD->setUninstantiatedDefaultArg(Record.readExpr());
   PD->ExplicitObjectParameterIntroducerLoc = Record.readSourceLocation();
 
@@ -1791,8 +1812,10 @@ void ASTDeclReader::VisitLabelDecl(LabelDecl *D) {
 void ASTDeclReader::VisitNamespaceDecl(NamespaceDecl *D) {
   RedeclarableResult Redecl = VisitRedeclarable(D);
   VisitNamedDecl(D);
-  D->setInline(Record.readInt());
-  D->setNested(Record.readInt());
+
+  BitsUnpacker NamespaceDeclBits(Record.readInt());
+  D->setInline(NamespaceDeclBits.getNextBit());
+  D->setNested(NamespaceDeclBits.getNextBit());
   D->LocStart = readSourceLocation();
   D->RBraceLoc = readSourceLocation();
 
@@ -1927,8 +1950,16 @@ void ASTDeclReader::VisitUnresolvedUsingIfExistsDecl(
 void ASTDeclReader::ReadCXXDefinitionData(
     struct CXXRecordDecl::DefinitionData &Data, const CXXRecordDecl *D,
     Decl *LambdaContext, unsigned IndexInLambdaContext) {
-#define FIELD(Name, Width, Merge) Data.Name = Record.readInt();
+
+  BitsUnpacker CXXRecordDeclBits = Record.readInt();
+
+#define FIELD(Name, Width, Merge)                                              \
+  if (!CXXRecordDeclBits.canGetNextNBits(Width))                              \
+    CXXRecordDeclBits.updateValue(Record.readInt());                           \
+  Data.Name = CXXRecordDeclBits.getNextBits(Width);
+
 #include "clang/AST/CXXRecordDeclDefinitionBits.def"
+#undef FIELD
 
   // Note: the caller has deserialized the IsLambda bit already.
   Data.ODRHash = Record.readInt();
@@ -1963,12 +1994,15 @@ void ASTDeclReader::ReadCXXDefinitionData(
     using Capture = LambdaCapture;
 
     auto &Lambda = static_cast<CXXRecordDecl::LambdaDefinitionData &>(Data);
-    Lambda.DependencyKind = Record.readInt();
-    Lambda.IsGenericLambda = Record.readInt();
-    Lambda.CaptureDefault = Record.readInt();
-    Lambda.NumCaptures = Record.readInt();
+
+    BitsUnpacker LambdaBits(Record.readInt());
+    Lambda.DependencyKind = LambdaBits.getNextBits(/*Width=*/2);
+    Lambda.IsGenericLambda = LambdaBits.getNextBit();
+    Lambda.CaptureDefault = LambdaBits.getNextBits(/*Width=*/2);
+    Lambda.NumCaptures = LambdaBits.getNextBits(/*Width=*/15);
+    Lambda.HasKnownInternalLinkage = LambdaBits.getNextBit();
+
     Lambda.NumExplicitCaptures = Record.readInt();
-    Lambda.HasKnownInternalLinkage = Record.readInt();
     Lambda.ManglingNumber = Record.readInt();
     if (unsigned DeviceManglingNumber = Record.readInt())
       Reader.getContext().DeviceLambdaManglingNumbers[D] = DeviceManglingNumber;
@@ -1983,8 +2017,10 @@ void ASTDeclReader::ReadCXXDefinitionData(
     Lambda.MethodTyInfo = readTypeSourceInfo();
     for (unsigned I = 0, N = Lambda.NumCaptures; I != N; ++I) {
       SourceLocation Loc = readSourceLocation();
-      bool IsImplicit = Record.readInt();
-      auto Kind = static_cast<LambdaCaptureKind>(Record.readInt());
+      BitsUnpacker CaptureBits(Record.readInt());
+      bool IsImplicit = CaptureBits.getNextBit();
+      auto Kind =
+          static_cast<LambdaCaptureKind>(CaptureBits.getNextBits(/*Width=*/3));
       switch (Kind) {
       case LCK_StarThis:
       case LCK_This:
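
The reader-side hunks above all follow one shape: a single Record.readInt()
yields a packed word, and a BitsUnpacker doles the fields back out in the
order the writer packed them. As a rough sketch of what such an unpacker
looks like (the member names getNextBit, getNextBits, canGetNextNBits and
updateValue are taken from the calls above; the implementation details and
the 32-bit backing word are assumptions, not copied from the patch):

  #include <cassert>
  #include <cstdint>

  class BitsUnpacker {
    uint32_t Value;
    uint32_t CurrentBitIndex = 0;

  public:
    BitsUnpacker(uint32_t V) : Value(V) {}

    // Start over on a fresh word when a packed record spans more than one
    // serialized integer (see the FIELD macro in ReadCXXDefinitionData).
    void updateValue(uint32_t V) {
      Value = V;
      CurrentBitIndex = 0;
    }

    bool canGetNextNBits(uint32_t Width) const {
      return CurrentBitIndex + Width <= 32;
    }

    bool getNextBit() { return getNextBits(/*Width=*/1); }

    uint32_t getNextBits(uint32_t Width) {
      assert(Width < 32 && canGetNextNBits(Width) && "not enough bits left");
      uint32_t Result = (Value >> CurrentBitIndex) & ((1u << Width) - 1);
      CurrentBitIndex += Width;
      return Result;
    }
  };
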
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 99358004ffd21d3..2fe12f7ff300f02 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5996,11 +5996,17 @@ void ASTRecordWriter::AddCXXCtorInitializers(
 
 void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
   auto &Data = D->data();
+
   Record->push_back(Data.IsLambda);
 
-  #define FIELD(Name, Width, Merge) \
-  Record->push_back(Data.Name);
-  #include "clang/AST/CXXRecordDeclDefinitionBits.def"
+  BitsPacker DefinitionBits;
+
+#define FIELD(Name, Width, Merge) DefinitionBits.addBits(Data.Name, Width);
+#include "clang/AST/CXXRecordDeclDefinitionBits.def"
+#undef FIELD
+
+  while (DefinitionBits.hasUnconsumedValues())
+    Record->push_back(DefinitionBits.getNextValue());
 
   // getODRHash will compute the ODRHash if it has not been previously computed.
   Record->push_back(D->getODRHash());
@@ -6032,12 +6038,16 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
     AddDeclRef(D->getFirstFriend());
   } else {
     auto &Lambda = D->getLambdaData();
-    Record->push_back(Lambda.DependencyKind);
-    Record->push_back(Lambda.IsGenericLambda);
-    Record->push_back(Lambda.CaptureDefault);
-    Record->push_back(Lambda.NumCaptures);
+
+    BitsPacker LambdaBits;
+    LambdaBits.addBits(Lambda.DependencyKind, /*Width=*/2);
+    LambdaBits.addBit(Lambda.IsGenericLambda);
+    LambdaBits.addBits(Lambda.CaptureDefault, /*Width=*/2);
+    LambdaBits.addBits(Lambda.NumCaptures, /*Width=*/15);
+    LambdaBits.addBit(Lambda.HasKnownInternalLinkage);
+    Record->push_back(LambdaBits.getNextValue());
+
     Record->push_back(Lambda.NumExplicitCaptures);
-    Record->push_back(Lambda.HasKnownInternalLinkage);
     Record->push_back(Lambda.ManglingNumber);
     Record->push_back(D->getDeviceLambdaManglingNumber());
     // The lambda context declaration and index within the context are provided
@@ -6046,8 +6056,10 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
     for (unsigned I = 0, N = Lambda.NumCaptures; I != N; ++I) {
       const LambdaCapture &Capture = Lambda.Captures.front()[I];
       AddSourceLocation(Capture.getLocation());
-      Record->push_back(Capture.isImplicit());
-      Record->push_back(Capture.getCaptureKind());
+      BitsPacker CaptureBits;
+      CaptureBits.addBit(Capture.isImplicit());
+      CaptureBits.addBits(Capture.getCaptureKind(), /*Width=*/3);
+      Record->push_back(CaptureBits);
       switch (Capture.getCaptureKind()) {
       case LCK_StarThis:
       case LCK_This:
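
The writer-side counterpart accumulates fields into one word, or several for
wide records like the CXXRecordDecl definition data. A sketch consistent with
the calls used above (addBit, addBits, getNextValue, hasUnconsumedValues, and
the implicit conversion relied on by Record->push_back(CaptureBits) come from
the diff; everything else is an assumption):

  #include <cassert>
  #include <cstdint>
  #include <vector>

  class BitsPacker {
    std::vector<uint32_t> Values = {0};
    uint32_t CurrentBitIndex = 0;
    unsigned NextToConsume = 0;

  public:
    void addBit(bool V) { addBits(V, /*Width=*/1); }

    void addBits(uint32_t V, uint32_t Width) {
      assert(Width < 32 && !(V >> Width) && "value exceeds declared width");
      if (CurrentBitIndex + Width > 32) { // spill into a fresh word
        Values.push_back(0);
        CurrentBitIndex = 0;
      }
      Values.back() |= V << CurrentBitIndex;
      CurrentBitIndex += Width;
    }

    bool hasUnconsumedValues() const { return NextToConsume < Values.size(); }
    uint32_t getNextValue() { return Values[NextToConsume++]; }

    // Convenience for the common single-word case.
    operator uint32_t() {
      assert(Values.size() == 1 && "packed more than one word");
      return Values.front();
    }
  };

Since the format records no field boundaries, packer and unpacker must agree
exactly on field order and widths; the /*Width=*/ comments on both sides are
the only documentation of that contract, which is why each reader hunk in
ASTReaderDecl.cpp mirrors its writer counterpart here and in
ASTWriterDecl.cpp below.
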
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index ef28e69abc4e166..b4438e4cf6a0c55 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -325,16 +325,21 @@ void ASTDeclWriter::VisitDecl(Decl *D) {
     Record.AddDeclRef(cast_or_null<Decl>(D->getLexicalDeclContext()));
   else
     Record.push_back(0);
-  Record.push_back(D->isInvalidDecl());
-  Record.push_back(D->hasAttrs());
+
+  BitsPacker DeclBits;
+  DeclBits.addBit(D->isInvalidDecl());
+  DeclBits.addBit(D->hasAttrs());
+  DeclBits.addBit(D->isImplicit());
+  DeclBits.addBit(D->isUsed(false));
+  DeclBits.addBit(D->isReferenced());
+  DeclBits.addBit(D->isTopLevelDeclInObjCContainer());
+  DeclBits.addBits(D->getAccess(), /*BitWidth=*/2);
+  DeclBits.addBits((uint64_t)D->getModuleOwnershipKind(), /*BitWidth=*/3);
+  Record.push_back(DeclBits);
+
   if (D->hasAttrs())
     Record.AddAttributes(D->getAttrs());
-  Record.push_back(D->isImplicit());
-  Record.push_back(D->isUsed(false));
-  Record.push_back(D->isReferenced());
-  Record.push_back(D->isTopLevelDeclInObjCContainer());
-  Record.push_back(D->getAccess());
-  Record.push_back((uint64_t)D->getModuleOwnershipKind());
+
   Record.push_back(Writer.getSubmoduleID(D->getOwningModule()));
 
   // If this declaration injected a name into a context different from its
@@ -438,12 +443,15 @@ void ASTDeclWriter::VisitTagDecl(TagDecl *D) {
   VisitRedeclarable(D);
   VisitTypeDecl(D);
   Record.push_back(D->getIdentifierNamespace());
-  Record.push_back((unsigned)D->getTagKind()); // FIXME: stable encoding
-  if (!isa<CXXRecordDecl>(D))
-    Record.push_back(D->isCompleteDefinition());
-  Record.push_back(D->isEmbeddedInDeclarator());
-  Record.push_back(D->isFreeStanding());
-  Record.push_back(D->isCompleteDefinitionRequired());
+
+  BitsPacker TagDeclBits;
+  TagDeclBits.addBits(D->getTagKind(), /*BitWidth=*/3);
+  TagDeclBits.addBit(!isa<CXXRecordDecl>(D) ? D->isCompleteDefinition() : 0);
+  TagDeclBits.addBit(D->isEmbeddedInDeclarator());
+  TagDeclBits.addBit(D->isFreeStanding());
+  TagDeclBits.addBit(D->isCompleteDefinitionRequired());
+  Record.push_back(TagDeclBits);
+
   Record.AddSourceRange(D->getBraceRange());
 
   if (D->hasExtInfo()) {
@@ -468,11 +476,15 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) {
   if (!D->getIntegerTypeSourceInfo())
     Record.AddTypeRef(D->getIntegerType());
   Record.AddTypeRef(D->getPromotionType());
-  Record.push_back(D->getNumPositiveBits());
-  Record.push_back(D->getNumNegativeBits());
-  Record.push_back(D->isScoped());
-  Record.push_back(D->isScopedUsingClassTag());
-  Record.push_back(D->isFixed());
+
+  BitsPacker EnumDeclBits;
+  EnumDeclBits.addBits(D->getNumPositiveBits(), /*BitWidth=*/8);
+  EnumDeclBits.addBits(D->getNumNegativeBits(), /*BitWidth=*/8);
+  EnumDeclBits.addBit(D->isScoped());
+  EnumDeclBits.addBit(D->isScopedUsingClassTag());
+  EnumDeclBits.addBit(D->isFixed());
+  Record.push_back(EnumDeclBits);
+
   Record.push_back(D->getODRHash());
 
   if (MemberSpecializationInfo *MemberInfo = D->getMemberSpecializationInfo()) {
@@ -511,18 +523,22 @@ void ASTDeclWriter::VisitRecordDecl(RecordDecl *D) {
                 "RecordDeclBits");
 
   VisitTagDecl(D);
-  Record.push_back(D->hasFlexibleArrayMember());
-  Record.push_back(D->isAnonymousStructOrUnion());
-  Record.push_back(D->hasObjectMember());
-  Record.push_back(D->hasVolatileMember());
-  Record.push_back(D->isNonTrivialToPrimitiveDefaultInitialize());
-  Record.push_back(D->isNonTrivialToPrimitiveCopy());
-  Record.push_back(D->isNonTrivialToPrimitiveDestroy());
-  Record.push_back(D->hasNonTrivialToPrimitiveDefaultInitializeCUnion());
-  Record.push_back(D->hasNonTrivialToPrimitiveDestructCUnion());
-  Record.push_back(D->hasNonTrivialToPrimitiveCopyCUnion());
-  Record.push_back(D->isParamDestroyedInCallee());
-  Record.push_back(llvm::to_underlying(D->getArgPassingRestrictions()));
+
+  BitsPacker RecordDeclBits;
+  RecordDeclBits.addBit(D->hasFlexibleArrayMember());
+  RecordDeclBits.addBit(D->isAnonymousStructOrUnion());
+  RecordDeclBits.addBit(D->hasObjectMember());
+  RecordDeclBits.addBit(D->hasVolatileMember());
+  RecordDeclBits.addBit(D->isNonTrivialToPrimitiveDefaultInitialize());
+  RecordDeclBits.addBit(D->isNonTrivialToPrimitiveCopy());
+  RecordDeclBits.addBit(D->isNonTrivialToPrimitiveDestroy());
+  RecordDeclBits.addBit(D->hasNonTrivialToPrimitiveDefaultInitializeCUnion());
+  RecordDeclBits.addBit(D->hasNonTrivialToPrimitiveDestructCUnion());
+  RecordDeclBits.addBit(D->hasNonTrivialToPrimitiveCopyCUnion());
+  RecordDeclBits.addBit(D->isParamDestroyedInCallee());
+  RecordDeclBits.addBits(
+      llvm::to_underlying(D->getArgPassingRestrictions()), 2);
+  Record.push_back(RecordDeclBits);
+
   // Only compute this for C/Objective-C, in C++ this is computed as part
   // of CXXRecordDecl.
   if (!isa<CXXRecordDecl>(D))
@@ -660,30 +676,31 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
   Record.AddDeclarationNameLoc(D->DNLoc, D->getDeclName());
   Record.push_back(D->getIdentifierNamespace());
 
-  // FunctionDecl's body is handled last at ASTWriterDecl::Visit,
-  // after everything else is written.
-  Record.push_back(
-      static_cast<int>(D->getStorageClass())); // FIXME: stable encoding
-  Record.push_back(D->isInlineSpecified());
-  Record.push_back(D->isInlined());
-  Record.push_back(D->isVirtualAsWritten());
-  Record.push_back(D->isPure());
-  Record.push_back(D->hasInheritedPrototype());
-  Record.push_back(D->hasWrittenPrototype());
-  Record.push_back(D->isDeletedBit());
-  Record.push_back(D->isTrivial());
-  Record.push_back(D->isTrivialForCall());
-  Record.push_back(D->isDefaulted());
-  Record.push_back(D->isExplicitlyDefaulted());
-  Record.push_back(D->isIneligibleOrNotSelected());
-  Record.push_back(D->hasImplicitReturnZero());
-  Record.push_back(static_cast<uint64_t>(D->getConstexprKind()));
-  Record.push_back(D->usesSEHTry());
-  Record.push_back(D->hasSkippedBody());
-  Record.push_back(D->isMultiVersion());
-  Record.push_back(D->isLateTemplateParsed());
-  Record.push_back(D->FriendConstraintRefersToEnclosingTemplate());
-  Record.push_back(llvm::to_underlying(D->getLinkageInternal()));
+  BitsPacker FunctionDeclBits;
+  // FIXME: stable encoding
+  FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3);
+  FunctionDeclBits.addBit(D->isInlineSpecified());
+  FunctionDeclBits.addBit(D->isInlined());
+  FunctionDeclBits.addBit(D->isVirtualAsWritten());
+  FunctionDeclBits.addBit(D->isPure());
+  FunctionDeclBits.addBit(D->hasInheritedPrototype());
+  FunctionDeclBits.addBit(D->hasWrittenPrototype());
+  FunctionDeclBits.addBit(D->isDeletedBit());
+  FunctionDeclBits.addBit(D->isTrivial());
+  FunctionDeclBits.addBit(D->isTrivialForCall());
+  FunctionDeclBits.addBit(D->isDefaulted());
+  FunctionDeclBits.addBit(D->isExplicitlyDefaulted());
+  FunctionDeclBits.addBit(D->isIneligibleOrNotSelected());
+  FunctionDeclBits.addBit(D->hasImplicitReturnZero());
+  FunctionDeclBits.addBits((uint64_t)(D->getConstexprKind()), /*BitWidth=*/2);
+  FunctionDeclBits.addBit(D->usesSEHTry());
+  FunctionDeclBits.addBit(D->hasSkippedBody());
+  FunctionDeclBits.addBit(D->isMultiVersion());
+  FunctionDeclBits.addBit(D->isLateTemplateParsed());
+  FunctionDeclBits.addBit(D->FriendConstraintRefersToEnclosingTemplate());
+  FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3);
+  Record.push_back(FunctionDeclBits);
+
   Record.AddSourceLocation(D->getEndLoc());
   Record.AddSourceLocation(D->getDefaultLoc());
 
@@ -1043,38 +1060,37 @@ void ASTDeclWriter::VisitIndirectFieldDecl(IndirectFieldDecl *D) {
 void ASTDeclWriter::VisitVarDecl(VarDecl *D) {
   VisitRedeclarable(D);
   VisitDeclaratorDecl(D);
-  Record.push_back(D->getStorageClass());
-  Record.push_back(D->getTSCSpec());
-  Record.push_back(D->getInitStyle());
-  Record.push_back(D->isARCPseudoStrong());
+
+  BitsPacker VarDeclBits;
+  VarDeclBits.addBits(D->getStorageClass(), /*BitWidth=*/3);
+  VarDeclBits.addBits(D->getTSCSpec(), /*BitWidth=*/2);
+  VarDeclBits.addBits(D->getInitStyle(), /*BitWidth=*/2);
+  VarDeclBits.addBit(D->isARCPseudoStrong());
+
   bool HasDeducedType = false;
   if (!isa<ParmVarDecl>(D)) {
-    Record.push_back(D->isThisDeclarationADemotedDefinition());
-    Record.push_back(D->isExceptionVariable());
-    Record.push_back(D->isNRVOVariable());
-    Record.push_back(D->isCXXForRangeDecl());
-    Record.push_back(D->isObjCForDecl());
-    Record.push_back(D->isInline());
-    Record.push_back(D->isInlineSpecified());
-    Record.push_back(D->isConstexpr());
-    Record.push_back(D->isInitCapture());
-    Record.push_back(D->isPreviousDeclInSameBlockScope());
+    VarDeclBits.addBit(D->isThisDeclarationADemotedDefinition());
+    VarDeclBits.addBit(D->isExceptionVariable());
+    VarDeclBits.addBit(D->isNRVOVariable());
+    VarDeclBits.addBit(D->isCXXForRangeDecl());
+    VarDeclBits.addBit(D->isObjCForDecl());
+    VarDeclBits.addBit(D->isInline());
+    VarDeclBits.addBit(D->isInlineSpecified());
+    VarDeclBits.addBit(D->isConstexpr());
+    VarDeclBits.addBit(D->isInitCapture());
+    VarDeclBits.addBit(D->isPreviousDeclInSameBlockScope());
+
     if (const auto *IPD = dyn_cast<ImplicitParamDecl>(D))
-      Record.push_back(static_cast<unsigned>(IPD->getParameterKind()));
+      VarDeclBits.addBits(IPD->getParameterKind(), /*Width=*/3);
     else
-      Record.push_back(0);
-    Record.push_back(D->isEscapingByref());
+      VarDeclBits.addBits(0, /*Width=*/3);
+
+    VarDeclBits.addBit(D->isEscapingByref());
     HasDeducedType = D->getType()->getContainedDeducedType();
-    Record.push_back(HasDeducedType);
+    VarDeclBits.addBit(HasDeducedType);
   }
-  Record.push_back(llvm::to_underlying(D->getLinkageInternal()));
 
-  if (D->hasAttr<BlocksAttr>()) {
-    BlockVarCopyInit Init = Writer.Context->getBlockVarCopyInit(D);
-    Record.AddStmt(Init.getCopyExpr());
-    if (Init.getCopyExpr())
-      Record.push_back(Init.canThrow());
-  }
+  VarDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()),
+                      /*BitWidth=*/3);
 
   bool ModulesCodegen = false;
   if (Writer.WritingModule && D->getStorageDuration() == SD_Static &&
@@ -1089,10 +1105,20 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) {
           Writer.Context->getLangOpts().BuildingPCHWithObjectFile)) &&
          Writer.Context->GetGVALinkageForVariable(D) >= GVA_StrongExternal;
   }
-  Record.push_back(ModulesCodegen);
+
+  VarDeclBits.addBit(ModulesCodegen);
+  Record.push_back(VarDeclBits);
+
   if (ModulesCodegen)
     Writer.ModularCodegenDecls.push_back(Writer.GetDeclRef(D));
 
+  if (D->hasAttr<BlocksAttr>()) {
+    BlockVarCopyInit Init = Writer.Context->getBlockVarCopyInit(D);
+    Record.AddStmt(Init.getCopyExpr());
+    if (Init.getCopyExpr())
+      Record.push_back(Init.canThrow());
+  }
+
   enum {
     VarNotTemplate = 0, VarTemplate, StaticDataMemberSpecialization
   };
@@ -1144,13 +1170,17 @@ void ASTDeclWriter::VisitImplicitParamDecl(ImplicitParamDecl *D) {
 
 void ASTDeclWriter::VisitParmVarDecl(ParmVarDecl *D) {
   VisitVarDecl(D);
-  Record.push_back(D->isObjCMethodParameter());
-  Record.push_back(D->getFunctionScopeDepth());
-  Record.push_back(D->getFunctionScopeIndex());
+
+  BitsPacker ParmVarDeclBits;
+  ParmVarDeclBits.addBit(D->isObjCMethodParameter());
+  ParmVarDeclBits.addBits(D->getFunctionScopeDepth(), /*BitsWidth=*/7);
+  ParmVarDeclBits.addBits(D->getFunctionScopeIndex(), /*BitsWidth=*/8);
+  ParmVarDeclBits.addBit(D->isKNRPromoted());
+  ParmVarDeclBits.addBit(D->hasInheritedDefaultArg());
+  ParmVarDeclBits.addBit(D->hasUninstantiatedDefaultArg());
+  Record.push_back(ParmVarDeclBits);
+
   Record.push_back(D->getObjCDeclQualifier()); // FIXME: stable encoding
-  Record.push_back(D->isKNRPromoted());
-  Record.push_back(D->hasInheritedDefaultArg());
-  Record.push_back(D->hasUninstantiatedDefaultArg());
   if (D->hasUninstantiatedDefaultArg())
     Record.AddStmt(D->getUninstantiatedDefaultArg());
   Record.AddSourceLocation(D->getExplicitObjectParamThisLoc());
@@ -1295,8 +1325,12 @@ void ASTDeclWriter::VisitLabelDecl(LabelDecl *D) {
 void ASTDeclWriter::VisitNamespaceDecl(NamespaceDecl *D) {
   VisitRedeclarable(D);
   VisitNamedDecl(D);
-  Record.push_back(D->isInline());
-  Record.push_back(D->isNested());
+
+  BitsPacker NamespaceDeclBits;
+  NamespaceDeclBits.addBit(D->isInline());
+  NamespaceDeclBits.addBit(D->isNested());
+  Record.push_back(NamespaceDeclBits);
+
   Record.AddSourceLocation(D->getBeginLoc());
   Record.AddSourceLocation(D->getRBraceLoc());
 
@@ -2005,14 +2039,11 @@ void ASTWriter::WriteDeclAbbrevs() {
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
   Abv->Add(BitCodeAbbrevOp(0));                       // LexicalDeclContext
-  Abv->Add(BitCodeAbbrevOp(0));                       // isInvalidDecl
-  Abv->Add(BitCodeAbbrevOp(0));                       // HasAttrs
-  Abv->Add(BitCodeAbbrevOp(0));                       // isImplicit
-  Abv->Add(BitCodeAbbrevOp(0));                       // isUsed
-  Abv->Add(BitCodeAbbrevOp(0));                       // isReferenced
-  Abv->Add(BitCodeAbbrevOp(0));                   // TopLevelDeclInObjCContainer
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2));  // AccessSpecifier
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3));  // ModuleOwnershipKind
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      11)); // Packed DeclBits: isInvalidDecl, HasAttrs, isImplicit, isUsed,
+            // isReferenced, TopLevelDeclInObjCContainer, AccessSpecifier,
+            // ModuleOwnershipKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
   // NamedDecl
   Abv->Add(BitCodeAbbrevOp(0));                       // NameKind = Identifier
@@ -2038,14 +2069,11 @@ void ASTWriter::WriteDeclAbbrevs() {
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
   Abv->Add(BitCodeAbbrevOp(0));                       // LexicalDeclContext
-  Abv->Add(BitCodeAbbrevOp(0));                       // isInvalidDecl
-  Abv->Add(BitCodeAbbrevOp(0));                       // HasAttrs
-  Abv->Add(BitCodeAbbrevOp(0));                       // isImplicit
-  Abv->Add(BitCodeAbbrevOp(0));                       // isUsed
-  Abv->Add(BitCodeAbbrevOp(0));                       // isReferenced
-  Abv->Add(BitCodeAbbrevOp(0));                   // TopLevelDeclInObjCContainer
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2));  // AccessSpecifier
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3));  // ModuleOwnershipKind
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      11)); // Packed DeclBits: isInvalidDecl, HasAttrs, isImplicit, isUsed,
+            // isReferenced, TopLevelDeclInObjCContainer, AccessSpecifier,
+            // ModuleOwnershipKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
   // NamedDecl
   Abv->Add(BitCodeAbbrevOp(0));                       // NameKind = Identifier
@@ -2076,14 +2104,11 @@ void ASTWriter::WriteDeclAbbrevs() {
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
   Abv->Add(BitCodeAbbrevOp(0));                       // LexicalDeclContext
-  Abv->Add(BitCodeAbbrevOp(0));                       // isInvalidDecl
-  Abv->Add(BitCodeAbbrevOp(0));                       // HasAttrs
-  Abv->Add(BitCodeAbbrevOp(0));                       // isImplicit
-  Abv->Add(BitCodeAbbrevOp(0));                       // isUsed
-  Abv->Add(BitCodeAbbrevOp(0));                       // isReferenced
-  Abv->Add(BitCodeAbbrevOp(0));                   // TopLevelDeclInObjCContainer
-  Abv->Add(BitCodeAbbrevOp(AS_none));                 // C++ AccessSpecifier
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // ModuleOwnershipKind
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      11)); // Packed DeclBits: isInvalidDecl, HasAttrs, isImplicit, isUsed,
+            // isReferenced, TopLevelDeclInObjCContainer, AccessSpecifier,
+            // ModuleOwnershipKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
   // NamedDecl
   Abv->Add(BitCodeAbbrevOp(0));                       // NameKind = Identifier
@@ -2094,11 +2119,10 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type Ref
   // TagDecl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // IdentifierNamespace
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // getTagKind
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isCompleteDefinition
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // EmbeddedInDeclarator
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsFreeStanding
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsCompleteDefinitionRequired
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      7)); // Packed Tag Decl Bits: getTagKind, isCompleteDefinition,
+           // EmbeddedInDeclarator, IsFreeStanding, isCompleteDefinitionRequired
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // SourceLocation
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // SourceLocation
   Abv->Add(BitCodeAbbrevOp(0));                         // ExtInfoKind
@@ -2106,11 +2130,7 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // AddTypeRef
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // IntegerType
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // getPromotionType
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // getNumPositiveBits
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // getNumNegativeBits
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isScoped
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isScopedUsingClassTag
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isFixed
+  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 19)); // Enum Decl Bits
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));// ODRHash
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // InstantiatedMembEnum
   // DC
@@ -2126,14 +2146,11 @@ void ASTWriter::WriteDeclAbbrevs() {
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
   Abv->Add(BitCodeAbbrevOp(0));                       // LexicalDeclContext
-  Abv->Add(BitCodeAbbrevOp(0));                       // isInvalidDecl
-  Abv->Add(BitCodeAbbrevOp(0));                       // HasAttrs
-  Abv->Add(BitCodeAbbrevOp(0));                       // isImplicit
-  Abv->Add(BitCodeAbbrevOp(0));                       // isUsed
-  Abv->Add(BitCodeAbbrevOp(0));                       // isReferenced
-  Abv->Add(BitCodeAbbrevOp(0));                   // TopLevelDeclInObjCContainer
-  Abv->Add(BitCodeAbbrevOp(AS_none));                 // C++ AccessSpecifier
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // ModuleOwnershipKind
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      11)); // Packed DeclBits: isInvalidDecl, HasAttrs, isImplicit, isUsed,
+            // isReferenced, TopLevelDeclInObjCContainer, AccessSpecifier,
+            // ModuleOwnershipKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
   // NamedDecl
   Abv->Add(BitCodeAbbrevOp(0));                       // NameKind = Identifier
@@ -2144,36 +2161,24 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type Ref
   // TagDecl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // IdentifierNamespace
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // getTagKind
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isCompleteDefinition
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // EmbeddedInDeclarator
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsFreeStanding
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsCompleteDefinitionRequired
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      7)); // Packed Tag Decl Bits: getTagKind, isCompleteDefinition,
+           // EmbeddedInDeclarator, IsFreeStanding, isCompleteDefinitionRequired
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // SourceLocation
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // SourceLocation
   Abv->Add(BitCodeAbbrevOp(0));                         // ExtInfoKind
   // RecordDecl
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // FlexibleArrayMember
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // AnonymousStructUnion
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // hasObjectMember
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // hasVolatileMember
-
-  // isNonTrivialToPrimitiveDefaultInitialize
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
-  // isNonTrivialToPrimitiveCopy
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
-  // isNonTrivialToPrimitiveDestroy
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
-  // hasNonTrivialToPrimitiveDefaultInitializeCUnion
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
-  // hasNonTrivialToPrimitiveDestructCUnion
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
-  // hasNonTrivialToPrimitiveCopyCUnion
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
-  // isParamDestroyedInCallee
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));
-  // getArgPassingRestrictions
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2));
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      13)); // Packed Record Decl Bits: FlexibleArrayMember,
+            // AnonymousStructUnion, hasObjectMember, hasVolatileMember,
+            // isNonTrivialToPrimitiveDefaultInitialize,
+            // isNonTrivialToPrimitiveCopy, isNonTrivialToPrimitiveDestroy,
+            // hasNonTrivialToPrimitiveDefaultInitializeCUnion,
+            // hasNonTrivialToPrimitiveDestructCUnion,
+            // hasNonTrivialToPrimitiveCopyCUnion, isParamDestroyedInCallee,
+            // getArgPassingRestrictions
   // ODRHash
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 26));
 
@@ -2190,14 +2195,11 @@ void ASTWriter::WriteDeclAbbrevs() {
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
   Abv->Add(BitCodeAbbrevOp(0));                       // LexicalDeclContext
-  Abv->Add(BitCodeAbbrevOp(0));                       // isInvalidDecl
-  Abv->Add(BitCodeAbbrevOp(0));                       // HasAttrs
-  Abv->Add(BitCodeAbbrevOp(0));                       // isImplicit
-  Abv->Add(BitCodeAbbrevOp(0));                       // isUsed
-  Abv->Add(BitCodeAbbrevOp(0));                       // isReferenced
-  Abv->Add(BitCodeAbbrevOp(0));                   // TopLevelDeclInObjCContainer
-  Abv->Add(BitCodeAbbrevOp(AS_none));                 // C++ AccessSpecifier
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // ModuleOwnershipKind
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      11)); // Packed DeclBits: isInvalidDecl, HasAttrs, isImplicit, isUsed,
+            // isReferenced, TopLevelDeclInObjCContainer, AccessSpecifier,
+            // ModuleOwnershipKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
   // NamedDecl
   Abv->Add(BitCodeAbbrevOp(0));                       // NameKind = Identifier
@@ -2210,20 +2212,17 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(0));                       // hasExtInfo
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TSIType
   // VarDecl
-  Abv->Add(BitCodeAbbrevOp(0));                       // SClass
-  Abv->Add(BitCodeAbbrevOp(0));                       // TSCSpec
-  Abv->Add(BitCodeAbbrevOp(0));                       // InitStyle
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isARCPseudoStrong
-  Abv->Add(BitCodeAbbrevOp(1));                         // Linkage::None
-  Abv->Add(BitCodeAbbrevOp(0));                       // ModulesCodegen
-  Abv->Add(BitCodeAbbrevOp(0));                       // VarKind (local enum)
+  Abv->Add(
+      BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                      12)); // Packed Var Decl bits: SClass, TSCSpec, InitStyle,
+                            // isARCPseudoStrong, Linkage, ModulesCodegen
+  Abv->Add(BitCodeAbbrevOp(0));                          // VarKind (local enum)
   // ParmVarDecl
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsObjCMethodParameter
-  Abv->Add(BitCodeAbbrevOp(0));                       // ScopeDepth
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ScopeIndex
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      19)); // Packed Parm Var Decl bits: IsObjCMethodParameter, ScopeDepth,
+            // ScopeIndex, KNRPromoted, HasInheritedDefaultArg
   Abv->Add(BitCodeAbbrevOp(0));                       // ObjCDeclQualifier
-  Abv->Add(BitCodeAbbrevOp(0));                       // KNRPromoted
-  Abv->Add(BitCodeAbbrevOp(0));                       // HasInheritedDefaultArg
   Abv->Add(BitCodeAbbrevOp(0));                   // HasUninstantiatedDefaultArg
   // Type Source Info
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
@@ -2238,14 +2237,11 @@ void ASTWriter::WriteDeclAbbrevs() {
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
   Abv->Add(BitCodeAbbrevOp(0));                       // LexicalDeclContext
-  Abv->Add(BitCodeAbbrevOp(0));                       // isInvalidDecl
-  Abv->Add(BitCodeAbbrevOp(0));                       // HasAttrs
-  Abv->Add(BitCodeAbbrevOp(0));                       // isImplicit
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isUsed
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isReferenced
-  Abv->Add(BitCodeAbbrevOp(0));                   // TopLevelDeclInObjCContainer
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // C++ AccessSpecifier
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // ModuleOwnershipKind
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      11)); // Packed DeclBits: isInvalidDecl, HasAttrs, isImplicit, isUsed,
+            // isReferenced, TopLevelDeclInObjCContainer, AccessSpecifier,
+            // ModuleOwnershipKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
   // NamedDecl
   Abv->Add(BitCodeAbbrevOp(0));                       // NameKind = Identifier
@@ -2267,14 +2263,11 @@ void ASTWriter::WriteDeclAbbrevs() {
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
   Abv->Add(BitCodeAbbrevOp(0));                       // LexicalDeclContext
-  Abv->Add(BitCodeAbbrevOp(0));                       // isInvalidDecl
-  Abv->Add(BitCodeAbbrevOp(0));                       // HasAttrs
-  Abv->Add(BitCodeAbbrevOp(0));                       // isImplicit
-  Abv->Add(BitCodeAbbrevOp(0));                       // isUsed
-  Abv->Add(BitCodeAbbrevOp(0));                       // isReferenced
-  Abv->Add(BitCodeAbbrevOp(0));                   // TopLevelDeclInObjCContainer
-  Abv->Add(BitCodeAbbrevOp(AS_none));                 // C++ AccessSpecifier
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // ModuleOwnershipKind
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      11)); // Packed DeclBits: isInvalidDecl, HasAttrs, isImplicit, isUsed,
+            // isReferenced, TopLevelDeclInObjCContainer, AccessSpecifier,
+            // ModuleOwnershipKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
   // NamedDecl
   Abv->Add(BitCodeAbbrevOp(0));                       // NameKind = Identifier
@@ -2287,25 +2280,14 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(0));                       // hasExtInfo
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TSIType
   // VarDecl
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // SClass
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // TSCSpec
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // InitStyle
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isARCPseudoStrong
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsThisDeclarationADemotedDefinition
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isExceptionVariable
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isNRVOVariable
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isCXXForRangeDecl
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isObjCForDecl
-  Abv->Add(BitCodeAbbrevOp(0));                         // isInline
-  Abv->Add(BitCodeAbbrevOp(0));                         // isInlineSpecified
-  Abv->Add(BitCodeAbbrevOp(0));                         // isConstexpr
-  Abv->Add(BitCodeAbbrevOp(0));                         // isInitCapture
-  Abv->Add(BitCodeAbbrevOp(0));                         // isPrevDeclInSameScope
-  Abv->Add(BitCodeAbbrevOp(0));                         // ImplicitParamKind
-  Abv->Add(BitCodeAbbrevOp(0));                         // EscapingByref
-  Abv->Add(BitCodeAbbrevOp(0));                         // HasDeducedType
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Linkage
-  Abv->Add(BitCodeAbbrevOp(0));                         // ModulesCodeGen
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      27)); // Packed Var Decl bits: SClass, TSCSpec, InitStyle,
+            // isARCPseudoStrong, IsThisDeclarationADemotedDefinition,
+            // isExceptionVariable, isNRVOVariable, isCXXForRangeDecl,
+            // isObjCForDecl, isInline, isInlineSpecified, isConstexpr,
+            // isInitCapture, isPrevDeclInSameScope, ImplicitParamKind,
+            // EscapingByref, HasDeducedType, Linkage, ModulesCodegen
   Abv->Add(BitCodeAbbrevOp(0));                         // VarKind (local enum)
   // Type Source Info
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
@@ -2322,14 +2304,11 @@ void ASTWriter::WriteDeclAbbrevs() {
   // Decl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // DeclContext
   Abv->Add(BitCodeAbbrevOp(0));                         // LexicalDeclContext
-  Abv->Add(BitCodeAbbrevOp(0));                         // Invalid
-  Abv->Add(BitCodeAbbrevOp(0));                         // HasAttrs
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Implicit
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Used
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Referenced
-  Abv->Add(BitCodeAbbrevOp(0));                         // InObjCContainer
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // Access
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // ModuleOwnershipKind
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      11)); // Packed DeclBits: isInvalidDecl, HasAttrs, isImplicit, isUsed,
+            // isReferenced, TopLevelDeclInObjCContainer, AccessSpecifier,
+            // ModuleOwnershipKind
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // SubmoduleID
   // NamedDecl
   Abv->Add(BitCodeAbbrevOp(DeclarationName::Identifier)); // NameKind
@@ -2343,27 +2322,14 @@ void ASTWriter::WriteDeclAbbrevs() {
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // TSIType
   // FunctionDecl
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 11)); // IDNS
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // StorageClass
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Inline
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // InlineSpecified
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // VirtualAsWritten
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Pure
-  Abv->Add(BitCodeAbbrevOp(0));                         // HasInheritedProto
-  Abv->Add(BitCodeAbbrevOp(1));                         // HasWrittenProto
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Deleted
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Trivial
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // TrivialForCall
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Defaulted
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ExplicitlyDefaulted
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsIneligibleOrNotSelected
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ImplicitReturnZero
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // Constexpr
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // UsesSEHTry
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // SkippedBody
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // MultiVersion
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // LateParsed
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // FriendConstraintRefersToEnclosingTemplate
-  Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Linkage
+  Abv->Add(BitCodeAbbrevOp(
+      BitCodeAbbrevOp::Fixed,
+      27)); // Packed Function Bits: StorageClass, Inline, InlineSpecified,
+            // VirtualAsWritten, Pure, HasInheritedProto, HasWrittenProto,
+            // Deleted, Trivial, TrivialForCall, Defaulted, ExplicitlyDefaulted,
+            // IsIneligibleOrNotSelected, ImplicitReturnZero, Constexpr,
+            // UsesSEHTry, SkippedBody, MultiVersion, LateParsed,
+            // FriendConstraintRefersToEnclosingTemplate, Linkage
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // LocEnd
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // Default
   Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // ODRHash
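
The Fixed widths chosen in WriteDeclAbbrevs are not arbitrary: each one is
the sum of the field widths packed by the corresponding writer above, so the
fixed abbreviation operand is wide enough to hold every packed word. A few
spot checks against the hunks above:

  DeclBits:       6 x 1 (bool flags) + 2 (AccessSpecifier)
                  + 3 (ModuleOwnershipKind)                       = 11
  TagDeclBits:    3 (TagKind) + 4 x 1 (bool flags)                = 7
  EnumDeclBits:   8 + 8 (positive/negative bits) + 3 x 1 (flags)  = 19
  RecordDeclBits: 11 x 1 (flags) + 2 (ArgPassingRestrictions)     = 13
  VarDeclBits:    3 (SClass) + 2 (TSCSpec) + 2 (InitStyle)
                  + 1 (ARCPseudoStrong) + 3 (Linkage)
                  + 1 (ModulesCodegen)                            = 12
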
diff --git a/clang/test/Modules/decl-params-determinisim.m b/clang/test/Modules/decl-params-determinisim.m
index d7e873bb4938e2c..9eedf0a206423b2 100644
--- a/clang/test/Modules/decl-params-determinisim.m
+++ b/clang/test/Modules/decl-params-determinisim.m
@@ -22,23 +22,23 @@
 /// op13 encodes the anonymous decl number which should be in order.
 // CHECK: <TYPE_FUNCTION_PROTO
 // CHECK-NEXT: <DECL_PARM_VAR
-// CHECK-SAME: op13=2
+// CHECK-SAME: op11=4024
 // CHECK-NEXT: <DECL_PARM_VAR
-// CHECK-SAME: op13=3
+// CHECK-SAME: op11=4032
 // CHECK-NEXT: <DECL_PARM_VAR
-// CHECK-SAME: op13=4
+// CHECK-SAME: op11=4040
 // CHECK-NEXT: <DECL_PARM_VAR
-// CHECK-SAME: op13=5
+// CHECK-SAME: op11=4048
 
 /// Decl records start at 43
 // CHECK: <DECL_RECORD
-// CHECK-SAME: op13=43
+// CHECK-SAME: op9=4352
 // CHECK-NEXT: <DECL_RECORD
-// CHECK-SAME: op13=44
+// CHECK-SAME: op9=4360
 // CHECK-NEXT: <DECL_RECORD
-// CHECK-SAME: op13=45
+// CHECK-SAME: op9=4368
 // CHECK-NEXT: <DECL_RECORD
-// CHECK-SAME: op13=46
+// CHECK-SAME: op9=4376
 
 //--- headers/a.h
 void f(struct A0 *a0,

>From ee9220c8e3fe3d738b0e84f706b789326d912937 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 15:04:47 +0100
Subject: [PATCH 30/76] [PatternMatch] Don't try to match sext/zext const exprs
 (NFCI)

---
 llvm/include/llvm/IR/PatternMatch.h | 82 +++++++++++++++++------------
 1 file changed, 49 insertions(+), 33 deletions(-)
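
The practical difference between the two matcher families introduced below:
CastOperator_match keys off llvm::Operator, so it matches both cast
instructions and the corresponding constant expressions, while CastInst_match
only matches genuine Instructions. Since sext and zext constant expressions
had already been removed from LLVM IR by this point, restricting m_SExt and
m_ZExt to instructions loses nothing, hence the NFCI tag. A sketch of the
contrast (not from the patch; the helper names are invented):

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  bool isPtrToIntOf(Value *V, Value *&X) {
    // m_PtrToInt stays CastOperator_match: matches a ptrtoint instruction
    // or a ptrtoint ConstantExpr, because ConstantExpr is an Operator.
    return match(V, m_PtrToInt(m_Value(X)));
  }

  bool isSExtOf(Value *V, Value *&X) {
    // m_SExt becomes CastInst_match: matches only an actual sext
    // instruction after this patch.
    return match(V, m_SExt(m_Value(X)));
  }
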

diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 13877538f79de6d..2551e81b62b6d0d 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1576,10 +1576,10 @@ m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp) {
 // Matchers for CastInst classes
 //
 
-template <typename Op_t, unsigned Opcode> struct CastClass_match {
+template <typename Op_t, unsigned Opcode> struct CastOperator_match {
   Op_t Op;
 
-  CastClass_match(const Op_t &OpMatch) : Op(OpMatch) {}
+  CastOperator_match(const Op_t &OpMatch) : Op(OpMatch) {}
 
   template <typename OpTy> bool match(OpTy *V) {
     if (auto *O = dyn_cast<Operator>(V))
@@ -1588,6 +1588,18 @@ template <typename Op_t, unsigned Opcode> struct CastClass_match {
   }
 };
 
+template <typename Op_t, unsigned Opcode> struct CastInst_match {
+  Op_t Op;
+
+  CastInst_match(const Op_t &OpMatch) : Op(OpMatch) {}
+
+  template <typename OpTy> bool match(OpTy *V) {
+    if (auto *I = dyn_cast<Instruction>(V))
+      return I->getOpcode() == Opcode && Op.match(I->getOperand(0));
+    return false;
+  }
+};
+
 template <typename Op_t> struct PtrToIntSameSize_match {
   const DataLayout &DL;
   Op_t Op;
@@ -1607,14 +1619,16 @@ template <typename Op_t> struct PtrToIntSameSize_match {
 
 /// Matches BitCast.
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::BitCast> m_BitCast(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::BitCast>(Op);
+inline CastOperator_match<OpTy, Instruction::BitCast>
+m_BitCast(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::BitCast>(Op);
 }
 
 /// Matches PtrToInt.
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::PtrToInt> m_PtrToInt(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::PtrToInt>(Op);
+inline CastOperator_match<OpTy, Instruction::PtrToInt>
+m_PtrToInt(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::PtrToInt>(Op);
 }
 
 template <typename OpTy>
@@ -1625,90 +1639,92 @@ inline PtrToIntSameSize_match<OpTy> m_PtrToIntSameSize(const DataLayout &DL,
 
 /// Matches IntToPtr.
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::IntToPtr> m_IntToPtr(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::IntToPtr>(Op);
+inline CastOperator_match<OpTy, Instruction::IntToPtr>
+m_IntToPtr(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::IntToPtr>(Op);
 }
 
 /// Matches Trunc.
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::Trunc> m_Trunc(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::Trunc>(Op);
+inline CastOperator_match<OpTy, Instruction::Trunc> m_Trunc(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::Trunc>(Op);
 }
 
 template <typename OpTy>
-inline match_combine_or<CastClass_match<OpTy, Instruction::Trunc>, OpTy>
+inline match_combine_or<CastOperator_match<OpTy, Instruction::Trunc>, OpTy>
 m_TruncOrSelf(const OpTy &Op) {
   return m_CombineOr(m_Trunc(Op), Op);
 }
 
 /// Matches SExt.
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::SExt> m_SExt(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::SExt>(Op);
+inline CastInst_match<OpTy, Instruction::SExt> m_SExt(const OpTy &Op) {
+  return CastInst_match<OpTy, Instruction::SExt>(Op);
 }
 
 /// Matches ZExt.
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::ZExt> m_ZExt(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::ZExt>(Op);
+inline CastInst_match<OpTy, Instruction::ZExt> m_ZExt(const OpTy &Op) {
+  return CastInst_match<OpTy, Instruction::ZExt>(Op);
 }
 
 template <typename OpTy>
-inline match_combine_or<CastClass_match<OpTy, Instruction::ZExt>, OpTy>
+inline match_combine_or<CastInst_match<OpTy, Instruction::ZExt>, OpTy>
 m_ZExtOrSelf(const OpTy &Op) {
   return m_CombineOr(m_ZExt(Op), Op);
 }
 
 template <typename OpTy>
-inline match_combine_or<CastClass_match<OpTy, Instruction::SExt>, OpTy>
+inline match_combine_or<CastInst_match<OpTy, Instruction::SExt>, OpTy>
 m_SExtOrSelf(const OpTy &Op) {
   return m_CombineOr(m_SExt(Op), Op);
 }
 
 template <typename OpTy>
-inline match_combine_or<CastClass_match<OpTy, Instruction::ZExt>,
-                        CastClass_match<OpTy, Instruction::SExt>>
+inline match_combine_or<CastInst_match<OpTy, Instruction::ZExt>,
+                        CastInst_match<OpTy, Instruction::SExt>>
 m_ZExtOrSExt(const OpTy &Op) {
   return m_CombineOr(m_ZExt(Op), m_SExt(Op));
 }
 
 template <typename OpTy>
 inline match_combine_or<
-    match_combine_or<CastClass_match<OpTy, Instruction::ZExt>,
-                     CastClass_match<OpTy, Instruction::SExt>>,
+    match_combine_or<CastInst_match<OpTy, Instruction::ZExt>,
+                     CastInst_match<OpTy, Instruction::SExt>>,
     OpTy>
 m_ZExtOrSExtOrSelf(const OpTy &Op) {
   return m_CombineOr(m_ZExtOrSExt(Op), Op);
 }
 
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::UIToFP> m_UIToFP(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::UIToFP>(Op);
+inline CastOperator_match<OpTy, Instruction::UIToFP> m_UIToFP(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::UIToFP>(Op);
 }
 
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::SIToFP> m_SIToFP(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::SIToFP>(Op);
+inline CastOperator_match<OpTy, Instruction::SIToFP> m_SIToFP(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::SIToFP>(Op);
 }
 
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::FPToUI> m_FPToUI(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::FPToUI>(Op);
+inline CastOperator_match<OpTy, Instruction::FPToUI> m_FPToUI(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::FPToUI>(Op);
 }
 
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::FPToSI> m_FPToSI(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::FPToSI>(Op);
+inline CastOperator_match<OpTy, Instruction::FPToSI> m_FPToSI(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::FPToSI>(Op);
 }
 
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::FPTrunc> m_FPTrunc(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::FPTrunc>(Op);
+inline CastOperator_match<OpTy, Instruction::FPTrunc>
+m_FPTrunc(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::FPTrunc>(Op);
 }
 
 template <typename OpTy>
-inline CastClass_match<OpTy, Instruction::FPExt> m_FPExt(const OpTy &Op) {
-  return CastClass_match<OpTy, Instruction::FPExt>(Op);
+inline CastOperator_match<OpTy, Instruction::FPExt> m_FPExt(const OpTy &Op) {
+  return CastOperator_match<OpTy, Instruction::FPExt>(Op);
 }
 
 //===----------------------------------------------------------------------===//
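
For reference, a typical consumer of the combined matchers kept above, as an
illustrative sketch (the function and its intent are invented for the
example; only the matcher names come from the header):

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Recognize add(ext(X), ext(Y)) where both operands are widened from the
  // same source type, a common InstCombine-style guard.
  bool isAddOfMatchingExtends(Value *V) {
    Value *X, *Y;
    return match(V, m_Add(m_ZExtOrSExt(m_Value(X)),
                          m_ZExtOrSExt(m_Value(Y)))) &&
           X->getType() == Y->getType();
  }
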

>From 68fbc8eec38d2930b60e01ffb2cbf297eac658fb Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Fri, 3 Nov 2023 14:04:26 +0000
Subject: [PATCH 31/76] [lldb][NFC] Use UNUSED_IF_ASSERT_DISABLED instead of
 (void) cast

Uses of (void) remain where they serve a purpose other than silencing a
variable that is only used in an assert.
---
 lldb/source/DataFormatters/StringPrinter.cpp              | 2 +-
 lldb/source/Expression/IRInterpreter.cpp                  | 2 +-
 lldb/source/Host/common/PseudoTerminal.cpp                | 2 +-
 lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp  | 2 +-
 lldb/source/Host/posix/MainLoopPosix.cpp                  | 6 +++---
 lldb/source/Host/windows/MainLoopWindows.cpp              | 8 ++++----
 .../ObjC/AppleObjCRuntime/AppleObjCRuntimeV1.cpp          | 2 +-
 .../Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp       | 3 ++-
 .../Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp      | 2 +-
 lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp  | 2 +-
 .../Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp      | 2 +-
 lldb/source/Symbol/SymbolFile.cpp                         | 2 +-
 lldb/source/Utility/Log.cpp                               | 2 +-
 lldb/tools/lldb-dap/DAP.cpp                               | 2 +-
 14 files changed, 20 insertions(+), 19 deletions(-)
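
For readers unfamiliar with the macro: UNUSED_IF_ASSERT_DISABLED is defined
elsewhere in lldb (not shown in this patch) and is, in spirit, a named
(void) cast. A sketch of the pattern it replaces, with the definition
assumed:

  #include <cassert>

  #define UNUSED_IF_ASSERT_DISABLED(x) ((void)(x))

  void example() {
    int ret = 0; // stand-in for a real call whose result is only checked
    assert(ret == 0);
    // Without this, NDEBUG builds warn that 'ret' is set but never used;
    // the macro name states the intent that a bare (void) cast does not.
    UNUSED_IF_ASSERT_DISABLED(ret);
  }
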

diff --git a/lldb/source/DataFormatters/StringPrinter.cpp b/lldb/source/DataFormatters/StringPrinter.cpp
index 4b57e87b4ccdcd9..ab07c74fd1854ba 100644
--- a/lldb/source/DataFormatters/StringPrinter.cpp
+++ b/lldb/source/DataFormatters/StringPrinter.cpp
@@ -183,7 +183,7 @@ DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>(
       &buffer_for_conversion, buffer_end, &codepoint, llvm::strictConversion);
   assert(result == llvm::conversionOK &&
          "Failed to convert legal utf8 sequence");
-  (void)result;
+  UNUSED_IF_ASSERT_DISABLED(result);
 
   // The UTF8 helper always advances by the utf8 encoded length.
   const unsigned utf8_encoded_len = buffer_for_conversion - buffer;
diff --git a/lldb/source/Expression/IRInterpreter.cpp b/lldb/source/Expression/IRInterpreter.cpp
index 10457f51a95d8b7..df0292270866397 100644
--- a/lldb/source/Expression/IRInterpreter.cpp
+++ b/lldb/source/Expression/IRInterpreter.cpp
@@ -1509,7 +1509,7 @@ bool IRInterpreter::Interpret(llvm::Module &module, llvm::Function &function,
           size_t dataSize = 0;
 
           bool Success = execution_unit.GetAllocSize(addr, dataSize);
-          (void)Success;
+          UNUSED_IF_ASSERT_DISABLED(Success);
           assert(Success &&
                  "unable to locate host data for transfer to device");
           // Create the required buffer
diff --git a/lldb/source/Host/common/PseudoTerminal.cpp b/lldb/source/Host/common/PseudoTerminal.cpp
index de49058beeb703e..d53327973eb2705 100644
--- a/lldb/source/Host/common/PseudoTerminal.cpp
+++ b/lldb/source/Host/common/PseudoTerminal.cpp
@@ -123,7 +123,7 @@ std::string PseudoTerminal::GetSecondaryName() const {
     char buf[PATH_MAX];
     buf[0] = '\0';
     int r = ptsname_r(m_primary_fd, buf, sizeof(buf));
-    (void)r;
+    UNUSED_IF_ASSERT_DISABLED(r);
     assert(r == 0);
     return buf;
 #if defined(__APPLE__)
diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
index 6a367a3307e543f..fceeff08ed9d36f 100644
--- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
+++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
@@ -510,7 +510,7 @@ ConnectionFileDescriptor::BytesAvailable(const Timeout<std::micro> &timeout,
           ssize_t bytes_read =
               llvm::sys::RetryAfterSignal(-1, ::read, pipe_fd, &c, 1);
           assert(bytes_read == 1);
-          (void)bytes_read;
+          UNUSED_IF_ASSERT_DISABLED(bytes_read);
           switch (c) {
           case 'q':
             LLDB_LOGF(log,
diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp
index 5b50b450433ea86..5fe4d015251c8a6 100644
--- a/lldb/source/Host/posix/MainLoopPosix.cpp
+++ b/lldb/source/Host/posix/MainLoopPosix.cpp
@@ -122,7 +122,7 @@ sigset_t MainLoopPosix::RunImpl::get_sigmask() {
   sigset_t sigmask;
   int ret = pthread_sigmask(SIG_SETMASK, nullptr, &sigmask);
   assert(ret == 0);
-  (void)ret;
+  UNUSED_IF_ASSERT_DISABLED(ret);
 
   for (const auto &sig : loop.m_signals)
     sigdelset(&sigmask, sig.first);
@@ -299,7 +299,7 @@ MainLoopPosix::RegisterSignal(int signo, const Callback &callback,
   // Even if using kqueue, the signal handler will still be invoked, so it's
   // important to replace it with our "benign" handler.
   int ret = sigaction(signo, &new_action, &info.old_action);
-  (void)ret;
+  UNUSED_IF_ASSERT_DISABLED(ret);
   assert(ret == 0 && "sigaction failed");
 
 #if HAVE_SYS_EVENT_H
@@ -346,7 +346,7 @@ void MainLoopPosix::UnregisterSignal(
   int ret = pthread_sigmask(it->second.was_blocked ? SIG_BLOCK : SIG_UNBLOCK,
                             &set, nullptr);
   assert(ret == 0);
-  (void)ret;
+  UNUSED_IF_ASSERT_DISABLED(ret);
 
 #if HAVE_SYS_EVENT_H
   struct kevent ev;
diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp
index 6e5f45d98208e16..25ea305c9976358 100644
--- a/lldb/source/Host/windows/MainLoopWindows.cpp
+++ b/lldb/source/Host/windows/MainLoopWindows.cpp
@@ -30,7 +30,7 @@ MainLoopWindows::~MainLoopWindows() {
   assert(m_read_fds.empty());
   BOOL result = WSACloseEvent(m_trigger_event);
   assert(result == TRUE);
-  (void)result;
+  UNUSED_IF_ASSERT_DISABLED(result);
 }
 
 llvm::Expected<size_t> MainLoopWindows::Poll() {
@@ -39,7 +39,7 @@ llvm::Expected<size_t> MainLoopWindows::Poll() {
   for (auto &[fd, info] : m_read_fds) {
     int result = WSAEventSelect(fd, info.event, FD_READ | FD_ACCEPT | FD_CLOSE);
     assert(result == 0);
-    (void)result;
+    UNUSED_IF_ASSERT_DISABLED(result);
 
     events.push_back(info.event);
   }
@@ -51,7 +51,7 @@ llvm::Expected<size_t> MainLoopWindows::Poll() {
   for (auto &fd : m_read_fds) {
     int result = WSAEventSelect(fd.first, WSA_INVALID_EVENT, 0);
     assert(result == 0);
-    (void)result;
+    UNUSED_IF_ASSERT_DISABLED(result);
   }
 
   if (result >= WSA_WAIT_EVENT_0 && result <= WSA_WAIT_EVENT_0 + events.size())
@@ -99,7 +99,7 @@ void MainLoopWindows::UnregisterReadObject(IOObject::WaitableHandle handle) {
   assert(it != m_read_fds.end());
   BOOL result = WSACloseEvent(it->second.event);
   assert(result == TRUE);
-  (void)result;
+  UNUSED_IF_ASSERT_DISABLED(result);
   m_read_fds.erase(it);
 }
 
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV1.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV1.cpp
index 65f00f5e4d0220d..93168c23f3547b6 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV1.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV1.cpp
@@ -161,7 +161,7 @@ AppleObjCRuntimeV1::CreateObjectChecker(std::string name,
                "           \n",
                name.c_str());
   assert(strformatsize < (int)sizeof(buf->contents));
-  (void)strformatsize;
+  UNUSED_IF_ASSERT_DISABLED(strformatsize);
 
   return GetTargetRef().CreateUtilityFunction(buf->contents, std::move(name),
                                               eLanguageTypeC, exe_ctx);
diff --git a/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp b/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp
index b0afe0394622075..d40f87b1a7b423c 100644
--- a/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp
+++ b/lldb/source/Plugins/ObjectFile/Breakpad/BreakpadRecords.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Plugins/ObjectFile/Breakpad/BreakpadRecords.h"
+#include "lldb/lldb-defines.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Endian.h"
@@ -119,7 +120,7 @@ static UUID parseModuleId(llvm::Triple::OSType os, llvm::StringRef str) {
   uint32_t age;
   bool success = to_integer(age_str, age, 16);
   assert(success);
-  (void)success;
+  UNUSED_IF_ASSERT_DISABLED(success);
   data.age = age;
 
   // On non-windows, the age field should always be zero, so we don't include to
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
index 84a009be50bf48f..19e0986ace31ff6 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
@@ -69,7 +69,7 @@ NativeProcessFreeBSD::Manager::Launch(ProcessLaunchInfo &launch_info,
   int wstatus;
   ::pid_t wpid = llvm::sys::RetryAfterSignal(-1, ::waitpid, pid, &wstatus, 0);
   assert(wpid == pid);
-  (void)wpid;
+  UNUSED_IF_ASSERT_DISABLED(wpid);
   if (!WIFSTOPPED(wstatus)) {
     LLDB_LOG(log, "Could not sync with inferior process: wstatus={1}",
              WaitStatus::Decode(wstatus));
diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
index aac3bc847dd051f..5d2b4b03fe60cb8 100644
--- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
+++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
@@ -281,7 +281,7 @@ NativeProcessLinux::Manager::Launch(ProcessLaunchInfo &launch_info,
   int wstatus = 0;
   ::pid_t wpid = llvm::sys::RetryAfterSignal(-1, ::waitpid, pid, &wstatus, 0);
   assert(wpid == pid);
-  (void)wpid;
+  UNUSED_IF_ASSERT_DISABLED(wpid);
   if (!WIFSTOPPED(wstatus)) {
     LLDB_LOG(log, "Could not sync with inferior process: wstatus={1}",
              WaitStatus::Decode(wstatus));
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 280571856035753..696708d3fc7cf5d 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -1762,7 +1762,7 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
     }
   }
   assert(tag_decl_kind != -1);
-  (void)tag_decl_kind;
+  UNUSED_IF_ASSERT_DISABLED(tag_decl_kind);
   bool clang_type_was_created = false;
   clang_type = CompilerType(
       m_ast.weak_from_this(),
diff --git a/lldb/source/Symbol/SymbolFile.cpp b/lldb/source/Symbol/SymbolFile.cpp
index 7dcee8ced0ea11b..4b9c3863e461590 100644
--- a/lldb/source/Symbol/SymbolFile.cpp
+++ b/lldb/source/Symbol/SymbolFile.cpp
@@ -216,7 +216,7 @@ void SymbolFileCommon::SetCompileUnitAtIndex(uint32_t idx,
   std::lock_guard<std::recursive_mutex> guard(GetModuleMutex());
   const size_t num_compile_units = GetNumCompileUnits();
   assert(idx < num_compile_units);
-  (void)num_compile_units;
+  UNUSED_IF_ASSERT_DISABLED(num_compile_units);
 
   // Fire off an assertion if this compile unit already exists for now. The
   // partial parsing should take care of only setting the compile unit
diff --git a/lldb/source/Utility/Log.cpp b/lldb/source/Utility/Log.cpp
index 75912683e2334b1..3a45a0285d3e254 100644
--- a/lldb/source/Utility/Log.cpp
+++ b/lldb/source/Utility/Log.cpp
@@ -210,7 +210,7 @@ void Log::Warning(const char *format, ...) {
 void Log::Register(llvm::StringRef name, Channel &channel) {
   auto iter = g_channel_map->try_emplace(name, channel);
   assert(iter.second == true);
-  (void)iter;
+  UNUSED_IF_ASSERT_DISABLED(iter);
 }
 
 void Log::Unregister(llvm::StringRef name) {
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index 6fa0514bfe32bdf..1bff198e4ac000f 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -57,7 +57,7 @@ DAP::DAP()
   int result = _setmode(fileno(stdout), _O_BINARY);
   assert(result);
   result = _setmode(fileno(stdin), _O_BINARY);
-  (void)result;
+  UNUSED_IF_ASSERT_DISABLED(result);
   assert(result);
 #endif
   if (log_file_path)

>From 5c3beb7b1e26d38b0933a28432dfbce4e00cf329 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 15:19:04 +0100
Subject: [PATCH 32/76] [MemCpyOpt] Handle memcpy marked as memory(none)

Fixes #71183.
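
For context, a condensed sketch of the guard being added (it assumes LLVM's
MemorySSA headers and is not a standalone program; the names mirror the
diff below):

    #include "llvm/Analysis/MemorySSA.h"
    using namespace llvm;

    // A memcpy declared memory(none) is not modeled by MemorySSA, so
    // getMemoryAccess() returns null; any unchecked cast<MemoryDef> on
    // that result would assert. Query it once, up front.
    static bool hasModeledAccess(MemorySSA &MSSA, Instruction &I) {
      return MSSA.getMemoryAccess(&I) != nullptr;
    }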
---
 llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 9 ++++++---
 llvm/test/Transforms/MemCpyOpt/memcpy.ll       | 9 +++++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index a31dedafdcbc449..39ad3d87779b526 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1667,6 +1667,11 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
     return true;
   }
 
+  MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
+  if (!MA)
+    // Degenerate case: memcpy marked as not accessing memory.
+    return false;
+
   // If copying from a constant, try to turn the memcpy into a memset.
   if (auto *GV = dyn_cast<GlobalVariable>(M->getSource()))
     if (GV->isConstant() && GV->hasDefinitiveInitializer())
@@ -1675,8 +1680,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
         IRBuilder<> Builder(M);
         Instruction *NewM = Builder.CreateMemSet(
             M->getRawDest(), ByteVal, M->getLength(), M->getDestAlign(), false);
-        auto *LastDef =
-            cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
+        auto *LastDef = cast<MemoryDef>(MA);
         auto *NewAccess =
             MSSAU->createMemoryAccessAfter(NewM, nullptr, LastDef);
         MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
@@ -1687,7 +1691,6 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
       }
 
   BatchAAResults BAA(*AA);
-  MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
   // FIXME: Not using getClobberingMemoryAccess() here due to PR54682.
   MemoryAccess *AnyClobber = MA->getDefiningAccess();
   MemoryLocation DestLoc = MemoryLocation::getForDest(M);
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 7488203d5db16dd..413d72a8e611558 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -723,6 +723,15 @@ define void @byval_param_noalias_metadata(ptr align 4 byval(i32) %ptr) {
   ret void
 }
 
+define void @memcpy_memory_none(ptr %p, ptr %p2, i64 %size) {
+; CHECK-LABEL: @memcpy_memory_none(
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[SIZE:%.*]], i1 false) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.p0.p0.i64(ptr %p, ptr %p2, i64 %size, i1 false) memory(none)
+  ret void
+}
+
 !0 = !{!0}
 !1 = !{!1, !0}
 !2 = !{!1}

>From a8ac6a986852fbf4befbcb056b7adcb197d90e26 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 3 Nov 2023 15:43:30 +0100
Subject: [PATCH 33/76] [SCEV] Remove newline after predicates in dump

update_analyze_test_checks.py will now insert check lines for
empty lines, which means that all the existing test coverage will
have a spurious change to check for the newline after "Predicates:".

I don't think we actually want to have that newline, so drop it
before it gets into more test coverage.
---
 llvm/lib/Analysis/ScalarEvolution.cpp                          | 3 +--
 .../Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll     | 3 ---
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index de24aa986688a1e..3a3d41dac78574c 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -13437,9 +13437,8 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
     for (const auto *P : Preds)
       P->print(OS, 4);
   } else {
-    OS << "Unpredictable predicated backedge-taken count. ";
+    OS << "Unpredictable predicated backedge-taken count.\n";
   }
-  OS << "\n";
 
   if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
     OS << "Loop ";
diff --git a/llvm/test/Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll b/llvm/test/Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll
index 802ebef82d9d394..1af1cc67a7027e0 100644
--- a/llvm/test/Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll
+++ b/llvm/test/Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll
@@ -353,7 +353,6 @@ define i64 @sext_like_noop(i32 %n) {
 ; PTR64_IDX64-NEXT:  Loop %for.body: symbolic max backedge-taken count is (-2 + (trunc i64 (ptrtoint ptr @sext_like_noop to i64) to i32))
 ; PTR64_IDX64-NEXT:  Loop %for.body: Predicated backedge-taken count is (-2 + (trunc i64 (ptrtoint ptr @sext_like_noop to i64) to i32))
 ; PTR64_IDX64-NEXT:   Predicates:
-; PTR64_IDX64-EMPTY:
 ; PTR64_IDX64-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 ; PTR64_IDX32-LABEL: 'sext_like_noop'
@@ -372,7 +371,6 @@ define i64 @sext_like_noop(i32 %n) {
 ; PTR64_IDX32-NEXT:  Loop %for.body: symbolic max backedge-taken count is (-2 + ptrtoint (ptr @sext_like_noop to i32))
 ; PTR64_IDX32-NEXT:  Loop %for.body: Predicated backedge-taken count is (-2 + ptrtoint (ptr @sext_like_noop to i32))
 ; PTR64_IDX32-NEXT:   Predicates:
-; PTR64_IDX32-EMPTY:
 ; PTR64_IDX32-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 ; PTR16_IDX16-LABEL: 'sext_like_noop'
@@ -391,7 +389,6 @@ define i64 @sext_like_noop(i32 %n) {
 ; PTR16_IDX16-NEXT:  Loop %for.body: symbolic max backedge-taken count is (-2 + (zext i16 (ptrtoint ptr @sext_like_noop to i16) to i32))<nsw>
 ; PTR16_IDX16-NEXT:  Loop %for.body: Predicated backedge-taken count is (-2 + (zext i16 (ptrtoint ptr @sext_like_noop to i16) to i32))<nsw>
 ; PTR16_IDX16-NEXT:   Predicates:
-; PTR16_IDX16-EMPTY:
 ; PTR16_IDX16-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:

>From 4ae416793037366bbffb19663521634fb855647c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 3 Nov 2023 14:54:47 +0000
Subject: [PATCH 34/76] [ConstraintElim] Add tests that require (UGE, Variable,
 0) info.

Inspired by
https://discourse.llvm.org/t/why-does-llvm-not-perform-range-analysis-on-integer-values/74341
and https://github.com/llvm/llvm-project/issues/63490.

Proving these facts requires the solver to know the trivial unsigned
fact (UGE, %x, 0) for each variable, e.g. to derive %a u< %c from
(%a + %b) u< %c when the add does not wrap.
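
A standalone analogy of that reasoning in plain C++ (illustrative only; the
pass reasons symbolically rather than over concrete values):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint16_t a = 100, b = 50, c = 200;
      // Mirrors `add nuw` plus the assume: a + b < c without unsigned wrap.
      assert(static_cast<uint32_t>(a) + b < c);
      // a < c follows only because b >= 0 holds for every unsigned b.
      assert(a < c);
      return 0;
    }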
---
 .../reason-about-add-operands.ll              | 401 ++++++++++++++++++
 1 file changed, 401 insertions(+)
 create mode 100644 llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll

diff --git a/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll b/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll
new file mode 100644
index 000000000000000..3c95b192705a51c
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll
@@ -0,0 +1,401 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -p constraint-elimination -S %s | FileCheck %s
+
+declare void @llvm.assume(i1)
+
+define i1 @addition_with_extra_facts_and_args_ult_i64(i64 noundef %a, i64 noundef %b, i64 noundef %c) {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_args_ult_i64(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i64 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i64 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i64 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[T:%.*]] = icmp ult i64 [[A]], [[C]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+entry:
+  %cmp.a = icmp ule i64 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i64 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i64 %b, %a
+  %cmp.add = icmp ult i64 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %t = icmp ult i64 %a, %c
+  ret i1 %t
+}
+
+define i1 @addition_with_extra_facts_and_args_ult_1(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_args_ult_1(
+; CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i16 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i16 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[T:%.*]] = icmp ult i16 [[A]], [[C]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+entry:
+  %cmp.a = icmp ule i16 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i16 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %t = icmp ult i16 %a, %c
+  ret i1 %t
+}
+
+define i1 @addition_with_extra_facts_and_args_ult_2(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_args_ult_2(
+; CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i16 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i16 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[T:%.*]] = icmp ult i16 [[B]], [[C]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+entry:
+  %cmp.a = icmp ule i16 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i16 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %t = icmp ult i16 %b, %c
+  ret i1 %t
+}
+
+define i1 @addition_with_extra_facts_and_args_ult_3(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_args_ult_3(
+; CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i16 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i16 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[F:%.*]] = icmp uge i16 [[A]], [[C]]
+; CHECK-NEXT:    ret i1 [[F]]
+;
+entry:
+  %cmp.a = icmp ule i16 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i16 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %f = icmp uge i16 %a, %c
+  ret i1 %f
+}
+
+define i1 @addition_with_extra_facts_and_args_ult_4(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_args_ult_4(
+; CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i16 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i16 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[F:%.*]] = icmp uge i16 [[B]], [[C]]
+; CHECK-NEXT:    ret i1 [[F]]
+;
+entry:
+  %cmp.a = icmp ule i16 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i16 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %f = icmp uge i16 %b, %c
+  ret i1 %f
+}
+
+define i1 @addition_with_extra_facts_and_args_ult_5(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_args_ult_5(
+; CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i16 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i16 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i16 [[A]], 10
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %cmp.a = icmp ule i16 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i16 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %cmp = icmp uge i16 %a, 10
+  ret i1 %cmp
+}
+
+define i1 @addition_with_extra_facts_and_args_ult_6(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_args_ult_6(
+; CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i16 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i16 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[A]], [[B]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %cmp.a = icmp ule i16 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i16 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %cmp = icmp ult i16 %a, %b
+  ret i1 %cmp
+}
+
+
+declare i16 @get()
+
+define i1 @addition_with_extra_facts_and_return_value_ult_1() {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_return_value_ult_1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[B:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[C:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i16 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i16 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[T:%.*]] = icmp ult i16 [[A]], [[C]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+entry:
+  %a = call i16 @get()
+  %b = call i16 @get()
+  %c = call i16 @get()
+  %cmp.a = icmp ule i16 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i16 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %t = icmp ult i16 %a, %c
+  ret i1 %t
+}
+
+define i1 @addition_with_extra_facts_and_return_value_ult_2() {
+; CHECK-LABEL: define i1 @addition_with_extra_facts_and_return_value_ult_2() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[B:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[C:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ule i16 [[A]], 2048
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_A]])
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ule i16 [[B]], 1024
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_B]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[F:%.*]] = icmp uge i16 [[A]], [[C]]
+; CHECK-NEXT:    ret i1 [[F]]
+;
+entry:
+  %a = call i16 @get()
+  %b = call i16 @get()
+  %c = call i16 @get()
+  %cmp.a = icmp ule i16 %a, 2048
+  call void @llvm.assume(i1 %cmp.a)
+  %cmp.b = icmp ule i16 %b, 1024
+  call void @llvm.assume(i1 %cmp.b)
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %f = icmp uge i16 %a, %c
+  ret i1 %f
+}
+
+define i1 @addition_no_extra_facts_with_return_value_ult_1() {
+; CHECK-LABEL: define i1 @addition_no_extra_facts_with_return_value_ult_1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[B:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[C:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[T:%.*]] = icmp ult i16 [[A]], [[C]]
+; CHECK-NEXT:    ret i1 [[T]]
+;
+entry:
+  %a = call i16 @get()
+  %b = call i16 @get()
+  %c = call i16 @get()
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %t = icmp ult i16 %a, %c
+  ret i1 %t
+}
+
+define i1 @addition_no_extra_facts_with_return_value_ult_2() {
+; CHECK-LABEL: define i1 @addition_no_extra_facts_with_return_value_ult_2() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[B:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[C:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[F:%.*]] = icmp uge i16 [[A]], [[C]]
+; CHECK-NEXT:    ret i1 [[F]]
+;
+entry:
+  %a = call i16 @get()
+  %b = call i16 @get()
+  %c = call i16 @get()
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %f = icmp uge i16 %a, %c
+  ret i1 %f
+}
+
+define i1 @addition_no_extra_facts_with_return_value_ult_3() {
+; CHECK-LABEL: define i1 @addition_no_extra_facts_with_return_value_ult_3() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[B:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[C:%.*]] = call i16 @get()
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_ADD]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i16 [[A]], 9
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+entry:
+  %a = call i16 @get()
+  %b = call i16 @get()
+  %c = call i16 @get()
+  %add = add nuw nsw i16 %b, %a
+  %cmp.add = icmp ult i16 %add, %c
+  call void @llvm.assume(i1 %cmp.add)
+  %cmp = icmp uge i16 %a, 9
+  ret i1 %cmp
+}
+
+; Test for https://github.com/llvm/llvm-project/issues/63490.
+define i1 @assume_x_ugt_y_plus_y_via_shl_eq(i8 %x, i8 %y) {
+; CHECK-LABEL: define i1 @assume_x_ugt_y_plus_y_via_shl_eq(
+; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
+; CHECK-NEXT:    [[S:%.*]] = shl nuw i8 [[Y]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[C_1]])
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i8 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[C_2]]
+;
+  %s = shl nuw i8 %y, 1
+  %c.1 = icmp ugt i8 %x, %s
+  tail call void @llvm.assume(i1 %c.1)
+  %c.2 = icmp eq i8 %x, %y
+  ret i1 %c.2
+}
+
+define i1 @assume_x_ugt_y_plus_y_via_shl_eq_no_nuw(i8 %x, i8 %y) {
+; CHECK-LABEL: define i1 @assume_x_ugt_y_plus_y_via_shl_eq_no_nuw(
+; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
+; CHECK-NEXT:    [[S:%.*]] = shl i8 [[Y]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[C_1]])
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i8 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[C_2]]
+;
+  %s = shl i8 %y, 1
+  %c.1 = icmp ugt i8 %x, %s
+  tail call void @llvm.assume(i1 %c.1)
+  %c.2 = icmp eq i8 %x, %y
+  ret i1 %c.2
+}
+
+define i1 @assume_x_ugt_y_plus_y_via_add_eq(i8 %x, i8 %y) {
+; CHECK-LABEL: define i1 @assume_x_ugt_y_plus_y_via_add_eq(
+; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
+; CHECK-NEXT:    [[S:%.*]] = add nuw i8 [[Y]], [[Y]]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[C_1]])
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i8 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[C_2]]
+;
+  %s = add nuw i8 %y, %y
+  %c.1 = icmp ugt i8 %x, %s
+  tail call void @llvm.assume(i1 %c.1)
+  %c.2 = icmp eq i8 %x, %y
+  ret i1 %c.2
+}
+
+define i1 @assume_x_ugt_y_plus_y_via_add_eq_no_nuw(i8 %x, i8 %y) {
+; CHECK-LABEL: define i1 @assume_x_ugt_y_plus_y_via_add_eq_no_nuw(
+; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
+; CHECK-NEXT:    [[S:%.*]] = add i8 [[Y]], [[Y]]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[C_1]])
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i8 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[C_2]]
+;
+  %s = add i8 %y, %y
+  %c.1 = icmp ugt i8 %x, %s
+  tail call void @llvm.assume(i1 %c.1)
+  %c.2 = icmp eq i8 %x, %y
+  ret i1 %c.2
+}
+
+define i1 @assume_x_ugt_y_plus_y_via_shl_ne(i8 %x, i8 %y) {
+; CHECK-LABEL: define i1 @assume_x_ugt_y_plus_y_via_shl_ne(
+; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
+; CHECK-NEXT:    [[S:%.*]] = shl nuw i8 [[Y]], 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[C_1]])
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ne i8 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[C_2]]
+;
+  %s = shl nuw i8 %y, 1
+  %c.1 = icmp ugt i8 %x, %s
+  tail call void @llvm.assume(i1 %c.1)
+  %c.2 = icmp ne i8 %x, %y
+  ret i1 %c.2
+}

>From f6ca0ed0385f57adf814439dfa7585a00284a144 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Fri, 3 Nov 2023 15:00:54 +0000
Subject: [PATCH 35/76] [llvm][AArch64][Assembly]: Add SME_F8F16 and SME_F8F32
 Ass/Disass. (#70640)

This patch adds the feature flags for SME_F8F16 and SME_F8F32,
and the assembly/disassembly for the following SME2 instructions:

  * SME:
    - FDOT
    - FMLAL, FMLALL
    - FVDOT, FVDOTT
    - FVDOTB
    - FMOPA

These are as specified in the Arm documentation:
https://developer.arm.com/documentation/ddi0602/2023-09

Co-authored-by: Caroline Concatto <caroline.concatto at arm.com>
---
 .../llvm/TargetParser/AArch64TargetParser.h   |   4 +
 llvm/lib/Target/AArch64/AArch64.td            |  13 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   4 +
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  53 +++
 llvm/lib/Target/AArch64/AArch64SchedA64FX.td  |   2 +-
 .../AArch64/AsmParser/AArch64AsmParser.cpp    |   3 +-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    | 107 +++++-
 .../MC/AArch64/FP8_SME2/dot-diagnostics.s     | 215 +++++++++++
 llvm/test/MC/AArch64/FP8_SME2/dot.s           | 361 ++++++++++++++++++
 .../MC/AArch64/FP8_SME2/mla-diagnostics.s     | 195 ++++++++++
 llvm/test/MC/AArch64/FP8_SME2/mla.s           | 361 ++++++++++++++++++
 .../MC/AArch64/FP8_SME2/mopa-diagnostics.s    |  46 +++
 llvm/test/MC/AArch64/FP8_SME2/mopa.s          |  39 ++
 llvm/test/MC/AArch64/SME2/fmlal-diagnostics.s |   4 +-
 llvm/test/MC/AArch64/SME2/fvdot-diagnostics.s |   4 +-
 .../TargetParser/TargetParserTest.cpp         |   7 +-
 16 files changed, 1404 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/MC/AArch64/FP8_SME2/dot-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/FP8_SME2/dot.s
 create mode 100644 llvm/test/MC/AArch64/FP8_SME2/mla-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/FP8_SME2/mla.s
 create mode 100644 llvm/test/MC/AArch64/FP8_SME2/mopa-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/FP8_SME2/mopa.s

diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 48dac9395d58314..c2f9bb290271353 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -170,6 +170,8 @@ enum ArchExtKind : unsigned {
   AEK_SSVE_FP8DOT4 =  66, // FEAT_SSVE_FP8DOT4
   AEK_LUT =           67, // FEAT_LUT
   AEK_SME_LUTv2 =     68, // FEAT_SME_LUTv2
+  AEK_SMEF8F16 =      69, // FEAT_SME_F8F16
+  AEK_SMEF8F32 =      70, // FEAT_SME_F8F32
   AEK_NUM_EXTENSIONS
 };
 using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>;
@@ -289,6 +291,8 @@ inline constexpr ExtensionInfo Extensions[] = {
     {"ssve-fp8dot4", AArch64::AEK_SSVE_FP8DOT4, "+ssve-fp8dot4", "-ssve-fp8dot4", FEAT_INIT, "+sme2", 0},
     {"lut", AArch64::AEK_LUT, "+lut", "-lut", FEAT_INIT, "", 0},
     {"sme-lutv2", AArch64::AEK_SME_LUTv2, "+sme-lutv2", "-sme-lutv2", FEAT_INIT, "", 0},
+    {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+sme2,+fp8", 0},
+    {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0},
     // Special cases
     {"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority},
 };
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 131086e12ce66b1..06ab560ce4108e1 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -540,6 +540,12 @@ def FeatureLUT: SubtargetFeature<"lut", "HasLUT", "true",
 def FeatureSME_LUTv2 : SubtargetFeature<"sme-lutv2", "HasSME_LUTv2", "true",
   "Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">;
 
+def FeatureSMEF8F16 : SubtargetFeature<"sme-f8f16", "HasSMEF8F16", "true",
+  "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSME2, FeatureFP8]>;
+
+def FeatureSMEF8F32 : SubtargetFeature<"sme-f8f32", "HasSMEF8F32", "true",
+  "Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>;
+
 def FeatureAppleA7SysReg  : SubtargetFeature<"apple-a7-sysreg", "HasAppleA7SysReg", "true",
   "Apple A7 (the CPU formerly known as Cyclone)">;
 
@@ -770,8 +776,8 @@ let F = [HasSVE2p1, HasSVE2p1_or_HasSME2, HasSVE2p1_or_HasSME2p1] in
 def SVE2p1Unsupported : AArch64Unsupported;
 
 def SVE2Unsupported : AArch64Unsupported {
-  let F = !listconcat([HasSVE2, HasSVE2orSME, HasSSVE_FP8FMA,
-                       HasSVE2AES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm],
+  let F = !listconcat([HasSVE2, HasSVE2orSME, HasSSVE_FP8FMA, HasSMEF8F16,
+                       HasSMEF8F32, HasSVE2AES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm],
                        SVE2p1Unsupported.F);
 }
 
@@ -784,7 +790,8 @@ let F = [HasSME2p1, HasSVE2p1_or_HasSME2p1] in
 def SME2p1Unsupported : AArch64Unsupported;
 
 def SME2Unsupported : AArch64Unsupported {
-  let F = !listconcat([HasSME2, HasSVE2p1_or_HasSME2, HasSSVE_FP8FMA],
+  let F = !listconcat([HasSME2, HasSVE2p1_or_HasSME2, HasSSVE_FP8FMA,
+                      HasSMEF8F16, HasSMEF8F32],
                       SME2p1Unsupported.F);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 7b62263e0d55426..685880a849de42f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -191,6 +191,10 @@ def HasLUT          : Predicate<"Subtarget->hasLUT()">,
                                  AssemblerPredicateWithAll<(all_of FeatureLUT), "lut">;
 def HasSME_LUTv2     : Predicate<"Subtarget->hasSME_LUTv2()">,
                                  AssemblerPredicateWithAll<(all_of FeatureSME_LUTv2), "sme-lutv2">;
+def HasSMEF8F16     : Predicate<"Subtarget->hasSMEF8F16()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureSMEF8F16), "sme-f8f16">;
+def HasSMEF8F32     : Predicate<"Subtarget->hasSMEF8F32()">,
+                                 AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">;
 
 // A subset of SVE(2) instructions are legal in Streaming SVE execution mode,
 // they should be enabled if either has been specified.
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 7f568c9a225952e..bb9464a8d2e1cf2 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -894,3 +894,56 @@ def LUTI4_4ZZT2Z    : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
 let Predicates = [HasSME2p1, HasSME_LUTv2] in {
 def LUTI4_S_4ZZT2Z  : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
 } //[HasSME2p1, HasSME_LUTv2]
+
+let Predicates = [HasSMEF8F16] in {
+defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_16b<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG2_M2ZZI_BtoH  : sme2p1_multi_vec_array_vg2_index_16b<"fdot",    0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG4_M4ZZI_BtoH  : sme2p1_multi_vec_array_vg4_index_16b<"fdot",    0b100, ZZZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG2_M2ZZ_BtoH   :  sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
+defm FDOT_VG4_M4ZZ_BtoH   :  sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
+// TODO: Replace nxv16i8 by nxv16f8
+defm FDOT_VG2_M2Z2Z_BtoH  : sme2_dot_mla_add_sub_array_vg2_multi<"fdot",    0b0100100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>;
+defm FDOT_VG4_M4Z4Z_BtoH  : sme2_dot_mla_add_sub_array_vg4_multi<"fdot",    0b0100100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>;
+
+def  FMLAL_MZZI_BtoH      : sme2_mla_ll_array_index_16b<"fmlal", 0b11, 0b00>;
+defm FMLAL_VG2_M2ZZI_BtoH : sme2_multi_vec_array_vg2_index_16b<"fmlal", 0b10, 0b111>;
+defm FMLAL_VG4_M4ZZI_BtoH : sme2_multi_vec_array_vg4_index_16b<"fmlal", 0b10, 0b110>;
+def  FMLAL_VG2_MZZ_BtoH   : sme2_mla_long_array_single_16b<"fmlal">;
+// TODO: Replace nxv16i8 by nxv16f8
+defm FMLAL_VG2_M2ZZ_BtoH  : sme2_fp_mla_long_array_vg2_single<"fmlal",  0b001, MatrixOp16, ZZ_b, ZPR4b8, nxv16i8, null_frag>;
+defm FMLAL_VG4_M4ZZ_BtoH  :  sme2_fp_mla_long_array_vg4_single<"fmlal", 0b001, MatrixOp16, ZZZZ_b, ZPR4b8, nxv16i8, null_frag>;
+defm FMLAL_VG2_M2Z2Z_BtoH : sme2_fp_mla_long_array_vg2_multi<"fmlal",   0b100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>;
+defm FMLAL_VG4_M4Z4Z_BtoH : sme2_fp_mla_long_array_vg4_multi<"fmlal",   0b100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>;
+
+defm FMOPA_MPPZZ_BtoH     : sme2p1_fmop_tile_fp16<"fmopa", 0b1, 0b0, 0b01, ZPR8>;
+
+} //[HasSMEF8F16]
+
+let Predicates = [HasSMEF8F32] in {
+// TODO : Replace nxv16i8 by nxv16f8
+defm FDOT_VG2_M2ZZI_BtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b01, 0b0111, ZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
+defm FDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b0001, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, null_frag>;
+defm FDOT_VG2_M2ZZ_BtoS  : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010011, MatrixOp32, ZZ_b, ZPR4b8>;
+defm FDOT_VG4_M4ZZ_BtoS  : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110011, MatrixOp32, ZZZZ_b, ZPR4b8>;
+// TODO : Replace nxv16i8 by nxv16f8
+defm FDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot",   0b0100110, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
+defm FDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot",   0b0100110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
+
+def FVDOTB_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdotb", 0b0>;
+def FVDOTT_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdott", 0b1>;
+
+defm FMLALL_MZZI_BtoS      : sme2_mla_ll_array_index_32b<"fmlall",     0b01, 0b000, null_frag>;
+defm FMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"fmlall", 0b10, 0b100, null_frag>;
+defm FMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"fmlall", 0b00, 0b1000, null_frag>;
+// TODO: Replace nxv16i8 by nxv16f8
+defm FMLALL_MZZ_BtoS       : sme2_mla_ll_array_single<"fmlall",      0b01000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, null_frag>;
+defm FMLALL_VG2_M2ZZ_BtoS  : sme2_mla_ll_array_vg24_single<"fmlall", 0b000001, MatrixOp32, ZZ_b, ZPR4b8>;
+defm FMLALL_VG4_M4ZZ_BtoS  : sme2_mla_ll_array_vg24_single<"fmlall", 0b010001, MatrixOp32, ZZZZ_b, ZPR4b8>;
+defm FMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"fmlall",   0b01000, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
+defm FMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"fmlall",   0b01000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
+
+
+defm FMOPA_MPPZZ_BtoS : sme_outer_product_fp32<0b0, 0b01, ZPR8, "fmopa", null_frag>;
+
+} //[HasSMEF8F32]
+
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
index 1de0e8291ef12d9..cb0d8b79ae7b9c9 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
@@ -23,7 +23,7 @@ def A64FXModel : SchedMachineModel {
   list<Predicate> UnsupportedFeatures =
     [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth,
      HasSVE2orSME, HasMTE, HasMatMulInt8, HasBF16, HasSME2, HasSME2p1, HasSVE2p1,
-     HasSVE2p1_or_HasSME2p1, HasSMEF16F16, HasSSVE_FP8FMA];
+     HasSVE2p1_or_HasSME2p1, HasSMEF16F16, HasSSVE_FP8FMA, HasSMEF8F16, HasSMEF8F32];
 
   let FullInstRWOverlapCheck = 0;
 }
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index b3a2c6a915246bb..7b59f1d6b5c1103 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -3660,6 +3660,8 @@ static const struct Extension {
     {"ssve-fp8dot4", {AArch64::FeatureSSVE_FP8DOT4}},
     {"lut", {AArch64::FeatureLUT}},
     {"sme-lutv2", {AArch64::FeatureSME_LUTv2}},
+    {"sme-f8f16", {AArch64::FeatureSMEF8F16}},
+    {"sme-f8f32", {AArch64::FeatureSMEF8F32}},
 };
 
 static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
@@ -4578,7 +4580,6 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) {
     Operands.push_back(
         AArch64Operand::CreateToken("]", getLoc(), getContext()));
   }
-
   return ParseStatus::Success;
 }
 
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index ee943c87bc1235e..4f40fa538b0c3c7 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -1922,6 +1922,17 @@ multiclass sme2_mla_long_array_single<string mnemonic, bits<2> op0, bits<2> op,
   def : SME2_ZA_TwoOp_Multi_Single_Pat<NAME # _HtoS, intrinsic, uimm3s2range, ZPR4b16, zpr_ty, tileslicerange3s2>;
 }
 
+class sme2_mla_long_array_single_16b<string mnemonic>
+    : sme2_mla_long_array<0b00, 0b00, MatrixOp16, uimm3s2range, ZPR8, ZPR4b8,  mnemonic> {
+    bits<4> Zm;
+    bits<5> Zn;
+    bits<3> imm;
+    let Inst{20}    = 0b1;
+    let Inst{19-16} = Zm;
+    let Inst{9-5}   = Zn;
+    let Inst{2-0}   = imm;
+}
+
 class sme2_mla_long_array_vg24_single<bits<2> op0, bit vg4, bits<2> op, bit o2,
                                       MatrixOperand matrix_ty, RegisterOperand multi_vector_ty,
                                       ZPRRegOp zpr_ty, string mnemonic, string vg_acronym>
@@ -1937,7 +1948,6 @@ class sme2_mla_long_array_vg24_single<bits<2> op0, bit vg4, bits<2> op, bit o2,
   let Inst{1-0}   = imm;
 }
 
-	
 multiclass sme2_fp_mla_long_array_vg2_single<string mnemonic, bits<3> op, MatrixOperand matrix_ty,
                                              RegisterOperand multi_vector_ty, ZPRRegOp vector_ty,
                                              ValueType zpr_ty, SDPatternOperator intrinsic> {
@@ -1971,7 +1981,8 @@ multiclass sme2_fp_mla_long_array_vg4_single<string mnemonic, bits<3> op, Matrix
                                              RegisterOperand multi_vector_ty, ZPRRegOp vector_ty,
                                              ValueType zpr_ty, SDPatternOperator intrinsic> {
   def NAME : sme2_mla_long_array_vg24_single<0b00, 0b1, op{2-1}, op{0}, matrix_ty, multi_vector_ty, 
-                                             vector_ty, mnemonic, "vgx4">, SMEPseudo2Instr<NAME, 1>;
+                                             vector_ty, mnemonic, "vgx4">,
+                                             SMEPseudo2Instr<NAME, 1>;
 
   def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, uimm2s2range, multi_vector_ty, vector_ty,
                                                       SMEMatrixArray>;
@@ -2390,7 +2401,6 @@ multiclass sme2_zip_vector_vg2<string mnemonic, bit op> {
 
 //===----------------------------------------------------------------------===//
 // SME2 Dot Products and MLA
-
 class sme2_multi_vec_array_vg2_index<bits<2> sz, bits<6> op, MatrixOperand matrix_ty,
                                      RegisterOperand multi_vector_ty,
                                      ZPRRegOp vector_ty, Operand index_ty,
@@ -2428,7 +2438,6 @@ multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<
     bits<2> i;
     let Inst{11-10} = i;
   }
-
   def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexS32b_timm, SMEMatrixArray>;
 
   def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
@@ -2439,6 +2448,7 @@ multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<
 }
 
 // SME2.1 multi-vec ternary indexed two registers 16-bit
+// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
 multiclass sme2p1_multi_vec_array_vg2_index_16b<string mnemonic, bits<2> sz, bits<3> op,
                                                 RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
   def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
@@ -2448,11 +2458,24 @@ multiclass sme2p1_multi_vec_array_vg2_index_16b<string mnemonic, bits<2> sz, bit
     let Inst{11-10} = i{2-1};
     let Inst{3}     = i{0};
   }
+
   def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
         (!cast<Instruction>(NAME) MatrixOp16:$ZAda,  MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
         multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>;
 }
 
+// SME2 multi-vec indexed FP8 two-way vertical dot product to single precision
+// two registers
+class sme2_fp8_multi_vec_array_vg4_index<string mnemonic, bit T>
+   : sme2_multi_vec_array_vg2_index<0b11, {0b01,?,0b0, T,?}, MatrixOp32,
+                                    ZZ_b_mul_r, ZPR4b8, VectorIndexS, mnemonic> {
+
+  bits<2> i;
+  let Inst{10} = i{1};
+  let Inst{3}  = i{0};
+  let AsmString = !strconcat(mnemonic, "{\t$ZAda[$Rv, $imm3, vgx4], $Zn, $Zm$i}");
+}
+
 // SME2 multi-vec ternary indexed two registers 64-bit
 
 class sme2_multi_vec_array_vg2_index_64b<bits<2> op,
@@ -2608,7 +2631,83 @@ multiclass sme2_multi_vec_array_vg4_index_64b<string mnemonic, bits<3> op,
         (!cast<Instruction>(NAME) MatrixOp64:$ZAda,  MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
         multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), 0>;
 }
+
+// FMLAL (multiple and indexed vector, FP8 to FP16)
+class sme2_multi_vec_array_vg24_index_16b<bits<2> sz, bit vg4, bits<3> op,
+                                          RegisterOperand multi_vector_ty, string mnemonic>
+    : I<(outs MatrixOp16:$ZAda),
+        (ins MatrixOp16:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2,
+         multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexB:$i),
+         mnemonic, "\t$ZAda[$Rv, $imm2, " # !if(vg4, "vgx4", "vgx2") # "], $Zn, $Zm$i",
+         "", []>, Sched<[]> {
+  bits<4> Zm;
+  bits<2> Rv;
+  bits<4> i;
+  bits<2> imm2;
+  let Inst{31-24} = 0b11000001;
+  let Inst{23-22} = sz;
+  let Inst{21-20} = 0b01;
+  let Inst{19-16} = Zm;
+  let Inst{15}    = vg4;
+  let Inst{14-13} = Rv;
+  let Inst{12}    = op{2};
+  let Inst{11-10} = i{3-2};
+  let Inst{5-4}   = op{1-0};
+  let Inst{3-2}   = i{1-0};
+  let Inst{1-0}   = imm2;
+
+  let Constraints = "$ZAda = $_ZAda";
+}
+
+multiclass sme2_multi_vec_array_vg2_index_16b<string mnemonic, bits<2> sz, bits<3>op> {
+  def NAME : sme2_multi_vec_array_vg24_index_16b<sz, 0b0, op, ZZ_b_mul_r, mnemonic> {
+    bits<4> Zn;
+    let Inst{9-6} = Zn;
+ }
+ def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm2], $Zn, $Zm$i",
+                 (!cast<Instruction>(NAME) MatrixOp16:$ZAda,  MatrixIndexGPR32Op8_11:$Rv,
+                  uimm2s2range:$imm2, ZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB:$i), 0>;
+}
+
+multiclass sme2_multi_vec_array_vg4_index_16b<string mnemonic, bits<2>sz, bits<3>op> {
+  def NAME: sme2_multi_vec_array_vg24_index_16b<sz, 0b1, op, ZZZZ_b_mul_r, mnemonic> {
+    bits<3> Zn;
+    let Inst{9-7} = Zn;
+    let Inst{6}   = 0b0;
+  }
+ def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm2], $Zn, $Zm$i",
+                 (!cast<Instruction>(NAME) MatrixOp16:$ZAda,  MatrixIndexGPR32Op8_11:$Rv,
+                  uimm2s2range:$imm2, ZZZZ_b_mul_r:$Zn, ZPR4b8:$Zm, VectorIndexB:$i), 0>;
+}
+
 //===----------------------------------------------------------------------===//
+// SME2 multi-vec indexed long long MLA one source 16-bit
+class sme2_mla_ll_array_index_16b<string mnemonic, bits<2> sz,bits<2> op>
+    : I<(outs MatrixOp16:$ZAda),
+        (ins MatrixOp16:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm3s2range:$imm3, ZPR8:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i),
+        mnemonic, "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+        "", []>, Sched<[]> {
+  bits<4> Zm;
+  bits<2> Rv;
+  bits<4> i;
+  bits<5> Zn;
+  bits<3> imm3;
+  let Inst{31-24} = 0b11000001;
+  let Inst{23-22} = sz;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = Zm;
+  let Inst{15}    = i{3};
+  let Inst{14-13} = Rv;
+  let Inst{12}    = op{1};
+  let Inst{11-10} = i{2-1};
+  let Inst{9-5}   = Zn;
+  let Inst{4}     = op{0};
+  let Inst{3}     = i{0};
+  let Inst{2-0}   = imm3;
+
+  let Constraints = "$ZAda = $_ZAda";
+}
+
 // SME2 multi-vec indexed long long MLA one source 32-bit
 class sme2_mla_ll_array_index_32b<string mnemonic, bits<2> sz, bits<3> op>
     : I<(outs MatrixOp32:$ZAda),
diff --git a/llvm/test/MC/AArch64/FP8_SME2/dot-diagnostics.s b/llvm/test/MC/AArch64/FP8_SME2/dot-diagnostics.s
new file mode 100644
index 000000000000000..204213d0bb75666
--- /dev/null
+++ b/llvm/test/MC/AArch64/FP8_SME2/dot-diagnostics.s
@@ -0,0 +1,215 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32  2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid vector select register
+
+fdot    za.h[w8, 0, vgx2], {z0.h-z1.h}, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot    za.h[w8, 0, vgx2], {z0.h-z1.h}, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.h[w11, 7], {z31.b-z2.b}, z15
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
+// CHECK-NEXT: fdot    za.h[w11, 7], {z31.b-z2.b}, z15
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.h[w11, 7, vgx2], {z28.b-z31.b}, {z0.b-z3.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot    za.h[w11, 7, vgx2], {z28.b-z31.b}, {z0.b-z3.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.s[w11, 7], {z29.b-z30.b}, {z30.b-z31.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot    za.s[w11, 7], {z29.b-z30.b}, {z30.b-z31.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.h[w11, 7], {z30.b-z0.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot    za.h[w11, 7], {z30.b-z0.b}, z15.
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+
+// --------------------------------------------------------------------------//
+// Invalid vector select offset
+
+fvdott  za.s[w11, -1, vgx4], {z30.b-z31.b}, z15.b[3]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [0, 7].
+// CHECK-NEXT: fvdott  za.s[w11, -1, vgx4], {z30.b-z31.b}, z15.b[3]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdott  za.s[w8, -1, vgx4], {z0.b-z1.b}, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [0, 7].
+// CHECK-NEXT: fvdott  za.s[w8, -1, vgx4], {z0.b-z1.b}, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdot   za.h[w11, -1], {z30.b-z31.b}, z15.b[7]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [0, 7].
+// CHECK-NEXT: fvdot   za.h[w11, -1], {z30.b-z31.b}, z15.b[7]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.s[w11, -1], {z28.b-z31.b}, z15.b[3]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [0, 7].
+// CHECK-NEXT: fdot    za.s[w11, -1], {z28.b-z31.b}, z15.b[3]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdott  za.s[w11, 8, vgx4], {z30.b-z31.b}, z15.b[3]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [0, 7].
+// CHECK-NEXT: fvdott  za.s[w11, 8, vgx4], {z30.b-z31.b}, z15.b[3]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdott  za.s[w8, 8, vgx4], {z0.b-z1.b}, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [0, 7].
+// CHECK-NEXT: fvdott  za.s[w8, 8, vgx4], {z0.b-z1.b}, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdot   za.h[w11, 8], {z30.b-z31.b}, z15.b[7]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [0, 7].
+// CHECK-NEXT: fvdot   za.h[w11, 8], {z30.b-z31.b}, z15.b[7]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.s[w11, 8], {z28.b-z31.b}, z15.b[3]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: immediate must be an integer in range [0, 7].
+// CHECK-NEXT: fdot    za.s[w11, 8], {z28.b-z31.b}, z15.b[3]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid vector list
+
+fdot    za.s[w11, 7, vgx4], {z29.b-z1.b}, {z29.b-z1.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: fdot    za.s[w11, 7, vgx4], {z29.b-z1.b}, {z29.b-z1.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.h[w11, 7], {z30.b-z2.b}, {z0.b-z3.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: fdot    za.h[w11, 7], {z30.b-z2.b}, {z0.b-z3.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.s[w8, 0], {z31.b-z3.b}, {z31.b-z3.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: fdot    za.s[w8, 0], {z31.b-z3.b}, {z31.b-z3.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.s[w11, 7, vgx2], {z30.b-z31.b}, {z0.b-z4.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: fdot    za.s[w11, 7, vgx2], {z30.b-z31.b}, {z0.b-z4.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid Register Suffix
+fdot    za.d[w11, 7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fdot    za.d[w11, 7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za[w11, 7], {z28.b-z31.b}, {z28.b-z31.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fdot    za[w11, 7], {z28.b-z31.b}, {z28.b-z31.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.b[w11, 7], {z31.b-z0.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fdot    za.b[w11, 7], {z31.b-z0.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.b[w11, 7, vgx2], {z30.h-z31.h}, {z30.h-z31.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fdot    za.b[w11, 7, vgx2], {z30.h-z31.h}, {z30.h-z31.h}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za[w11, 7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fdot    za[w11, 7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.d[w11, 7], {z28.b-z31.b}, {z28.b-z31.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fdot    za.d[w11, 7], {z28.b-z31.b}, {z28.b-z31.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid vector select register
+
+fdot    za.h[w7, 7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [w8, w11]
+// CHECK-NEXT: fdot    za.h[w7, 7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.h[w, 0, vgx2], {z0.b-z1.b}, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [w8, w11]
+// CHECK-NEXT: fdot    za.h[w, 0, vgx2], {z0.b-z1.b}, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.s[w12, 0], {z0.b-z3.b}, {z0.b-z3.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [w8, w11]
+// CHECK-NEXT: fdot    za.s[w12, 0], {z0.b-z3.b}, {z0.b-z3.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid indexed-vector or single-vector register
+
+fdot za.h[w8, 0], {z0.b-z1.b}, z16.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z15.b
+// CHECK-NEXT: fdot za.h[w8, 0], {z0.b-z1.b}, z16.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot za.s[w8, 0], {z0.b-z1.b}, z16.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z15.b
+// CHECK-NEXT: fdot za.s[w8, 0], {z0.b-z1.b}, z16.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid vector grouping
+
+fdot    za.h[w11, 7], {z28.b-z31.b}, {z0.b-z2.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot    za.h[w11, 7], {z28.b-z31.b}, {z0.b-z2.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.h[w11, 7, vgx4], {z31.b-z0.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fdot    za.h[w11, 7, vgx4], {z31.b-z0.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid lane index
+
+fdot    za.h[w8, 0, vgx2], {z0.b-z1.b}, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: fdot    za.h[w8, 0, vgx2], {z0.b-z1.b}, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.h[w11, 7], {z30.b-z31.b}, z15.b[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: fdot    za.h[w11, 7], {z30.b-z31.b}, z15.b[8]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.s[w8, 0], {z0.b-z1.b}, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fdot    za.s[w8, 0], {z0.b-z1.b}, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fdot    za.s[w11, 7], {z30.b-z31.b}, z15.b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fdot    za.s[w11, 7], {z30.b-z31.b}, z15.b[4]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdot   za.h[w8, 0, vgx2], {z0.b-z1.b}, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: fvdot   za.h[w8, 0, vgx2], {z0.b-z1.b}, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdot   za.h[w11, 7], {z30.b-z31.b}, z15.b[8]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 7].
+// CHECK-NEXT: fvdot   za.h[w11, 7], {z30.b-z31.b}, z15.b[8]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdotb  za.s[w8, 0, vgx4], {z0.b-z1.b}, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fvdotb  za.s[w8, 0, vgx4], {z0.b-z1.b}, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fvdott  za.s[w11, 7, vgx4], {z30.b-z31.b}, z15.b[4]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3].
+// CHECK-NEXT: fvdott  za.s[w11, 7, vgx4], {z30.b-z31.b}, z15.b[4]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
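+
+// Summary of the lane-index rules exercised above (an informational comment
+// derived from the checked diagnostics, not a FileCheck directive): fdot and
+// fvdot forms writing za.h accept FP8 lane indices 0-7, while fdot, fvdotb
+// and fvdott forms writing za.s accept lane indices 0-3.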
diff --git a/llvm/test/MC/AArch64/FP8_SME2/dot.s b/llvm/test/MC/AArch64/FP8_SME2/dot.s
new file mode 100644
index 000000000000000..e5c717462dce6ee
--- /dev/null
+++ b/llvm/test/MC/AArch64/FP8_SME2/dot.s
@@ -0,0 +1,361 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f8f16,+sme-f8f32 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sme-f8f16,-sme-f8f32 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f8f16,+sme-f8f32 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
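+
+// A standalone sketch of the round-trip check above, assuming llvm-mc is on
+// PATH and this file is saved as dot.s (illustrative only, not a checked RUN
+// line):
+//   llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32 dot.s \
+//     | sed '/.text/d; s/.*encoding: //' \
+//     | llvm-mc -triple=aarch64 -mattr=+sme-f8f16,+sme-f8f32 -disassemble -show-encoding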
+
+
+// FDOT
+// x2
+
+fdot    za.h[w8, 0, vgx2], {z0.b-z1.b}, z0.b  // 11000001-00100000-00010000-00001000
+// CHECK-INST: fdot    za.h[w8, 0, vgx2], { z0.b, z1.b }, z0.b
+// CHECK-ENCODING: [0x08,0x10,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1201008 <unknown>
+
+fdot    za.h[w8, 0], {z0.b-z1.b}, z0.b  // 11000001-00100000-00010000-00001000
+// CHECK-INST: fdot    za.h[w8, 0, vgx2], { z0.b, z1.b }, z0.b
+// CHECK-ENCODING: [0x08,0x10,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1201008 <unknown>
+
+fdot    za.h[w11, 7], {z13.b-z14.b}, z8.b  // 11000001-00101000-01110001-10101111
+// CHECK-INST: fdot    za.h[w11, 7, vgx2], { z13.b, z14.b }, z8.b
+// CHECK-ENCODING: [0xaf,0x71,0x28,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c12871af <unknown>
+
+fdot    za.h[w11, 7, vgx2], {z31.b-z0.b}, z15.b  // 11000001-00101111-01110011-11101111
+// CHECK-INST: fdot    za.h[w11, 7, vgx2], { z31.b, z0.b }, z15.b
+// CHECK-ENCODING: [0xef,0x73,0x2f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c12f73ef <unknown>
+
+fdot    za.s[w8, 0, vgx2], {z0.b-z1.b}, z0.b  // 11000001-00100000-00010000-00011000
+// CHECK-INST: fdot    za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b
+// CHECK-ENCODING: [0x18,0x10,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1201018 <unknown>
+
+fdot    za.s[w8, 0], {z0.b-z1.b}, z0.b  // 11000001-00100000-00010000-00011000
+// CHECK-INST: fdot    za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b
+// CHECK-ENCODING: [0x18,0x10,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1201018 <unknown>
+
+fdot    za.s[w11, 7, vgx2], {z31.b-z0.b}, z15.b  // 11000001-00101111-01110011-11111111
+// CHECK-INST: fdot    za.s[w11, 7, vgx2], { z31.b, z0.b }, z15.b
+// CHECK-ENCODING: [0xff,0x73,0x2f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c12f73ff <unknown>
+
+fdot    za.s[w11, 7], {z31.b-z0.b}, z15.b  // 11000001-00101111-01110011-11111111
+// CHECK-INST: fdot    za.s[w11, 7, vgx2], { z31.b, z0.b }, z15.b
+// CHECK-ENCODING: [0xff,0x73,0x2f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c12f73ff <unknown>
+
+fdot    za.h[w8, 0, vgx2], {z0.b-z1.b}, {z0.b-z1.b}  // 11000001-10100000-00010000-00100000
+// CHECK-INST: fdot    za.h[w8, 0, vgx2], { z0.b, z1.b }, { z0.b, z1.b }
+// CHECK-ENCODING: [0x20,0x10,0xa0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1a01020 <unknown>
+
+fdot    za.h[w8, 0], {z0.b-z1.b}, {z0.b-z1.b}  // 11000001-10100000-00010000-00100000
+// CHECK-INST: fdot    za.h[w8, 0, vgx2], { z0.b, z1.b }, { z0.b, z1.b }
+// CHECK-ENCODING: [0x20,0x10,0xa0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1a01020 <unknown>
+
+fdot    za.h[w11, 7, vgx2], {z30.b-z31.b}, {z30.b-z31.b}  // 11000001-10111110-01110011-11100111
+// CHECK-INST: fdot    za.h[w11, 7, vgx2], { z30.b, z31.b }, { z30.b, z31.b }
+// CHECK-ENCODING: [0xe7,0x73,0xbe,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1be73e7 <unknown>
+
+fdot    za.h[w11, 7], {z30.b-z31.b}, {z30.b-z31.b}  // 11000001-10111110-01110011-11100111
+// CHECK-INST: fdot    za.h[w11, 7, vgx2], { z30.b, z31.b }, { z30.b, z31.b }
+// CHECK-ENCODING: [0xe7,0x73,0xbe,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1be73e7 <unknown>
+
+fdot    za.s[w8, 0, vgx2], {z0.b-z1.b}, {z0.b-z1.b}  // 11000001-10100000-00010000-00110000
+// CHECK-INST: fdot    za.s[w8, 0, vgx2], { z0.b, z1.b }, { z0.b, z1.b }
+// CHECK-ENCODING: [0x30,0x10,0xa0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1a01030 <unknown>
+
+fdot    za.s[w8, 0], {z0.b-z1.b}, {z0.b-z1.b}  // 11000001-10100000-00010000-00110000
+// CHECK-INST: fdot    za.s[w8, 0, vgx2], { z0.b, z1.b }, { z0.b, z1.b }
+// CHECK-ENCODING: [0x30,0x10,0xa0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1a01030 <unknown>
+
+fdot    za.s[w11, 7, vgx2], {z30.b-z31.b}, {z30.b-z31.b}  // 11000001-10111110-01110011-11110111
+// CHECK-INST: fdot    za.s[w11, 7, vgx2], { z30.b, z31.b }, { z30.b, z31.b }
+// CHECK-ENCODING: [0xf7,0x73,0xbe,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1be73f7 <unknown>
+
+fdot    za.s[w11, 7], {z30.b-z31.b}, {z30.b-z31.b}  // 11000001-10111110-01110011-11110111
+// CHECK-INST: fdot    za.s[w11, 7, vgx2], { z30.b, z31.b }, { z30.b, z31.b }
+// CHECK-ENCODING: [0xf7,0x73,0xbe,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1be73f7 <unknown>
+
+fdot    za.h[w8, 0, vgx2], {z0.b-z1.b}, z0.b[0]  // 11000001-11010000-00000000-00100000
+// CHECK-INST: fdot    za.h[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x20,0x00,0xd0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1d00020 <unknown>
+
+fdot    za.h[w8, 0], {z0.b-z1.b}, z0.b[0]  // 11000001-11010000-00000000-00100000
+// CHECK-INST: fdot    za.h[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x20,0x00,0xd0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1d00020 <unknown>
+
+fdot    za.h[w11, 7, vgx2], {z30.b-z31.b}, z15.b[7]  // 11000001-11011111-01101111-11101111
+// CHECK-INST: fdot    za.h[w11, 7, vgx2], { z30.b, z31.b }, z15.b[7]
+// CHECK-ENCODING: [0xef,0x6f,0xdf,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1df6fef <unknown>
+
+fdot    za.h[w11, 7], {z30.b-z31.b}, z15.b[7]  // 11000001-11011111-01101111-11101111
+// CHECK-INST: fdot    za.h[w11, 7, vgx2], { z30.b, z31.b }, z15.b[7]
+// CHECK-ENCODING: [0xef,0x6f,0xdf,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1df6fef <unknown>
+
+fdot    za.s[w8, 0, vgx2], {z0.b-z1.b}, z0.b[0]  // 11000001-01010000-00000000-00111000
+// CHECK-INST: fdot    za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x38,0x00,0x50,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1500038 <unknown>
+
+fdot    za.s[w8, 0], {z0.b-z1.b}, z0.b[0]  // 11000001-01010000-00000000-00111000
+// CHECK-INST: fdot    za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x38,0x00,0x50,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1500038 <unknown>
+
+fdot    za.s[w11, 7, vgx2], {z30.b-z31.b}, z15.b[3]  // 11000001-01011111-01101111-11111111
+// CHECK-INST: fdot    za.s[w11, 7, vgx2], { z30.b, z31.b }, z15.b[3]
+// CHECK-ENCODING: [0xff,0x6f,0x5f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c15f6fff <unknown>
+
+fdot    za.s[w11, 7], {z30.b-z31.b}, z15.b[3]  // 11000001-01011111-01101111-11111111
+// CHECK-INST: fdot    za.s[w11, 7, vgx2], { z30.b, z31.b }, z15.b[3]
+// CHECK-ENCODING: [0xff,0x6f,0x5f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c15f6fff <unknown>
+
+// x4
+
+
+fdot    za.h[w8, 0, vgx4], {z0.b-z3.b}, z0.b  // 11000001-00110000-00010000-00001000
+// CHECK-INST: fdot    za.h[w8, 0, vgx4], { z0.b - z3.b }, z0.b
+// CHECK-ENCODING: [0x08,0x10,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1301008 <unknown>
+
+fdot    za.h[w8, 0], {z0.b-z3.b}, z0.b  // 11000001-00110000-00010000-00001000
+// CHECK-INST: fdot    za.h[w8, 0, vgx4], { z0.b - z3.b }, z0.b
+// CHECK-ENCODING: [0x08,0x10,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1301008 <unknown>
+
+fdot    za.h[w11, 7, vgx4], {z31.b-z2.b}, z15.b  // 11000001-00111111-01110011-11101111
+// CHECK-INST: fdot    za.h[w11, 7, vgx4], {  z31.b, z0.b, z1.b, z2.b  }, z15.b
+// CHECK-ENCODING: [0xef,0x73,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c13f73ef <unknown>
+
+fdot    za.h[w11, 7], {z31.b-z2.b}, z15.b  // 11000001-00111111-01110011-11101111
+// CHECK-INST: fdot    za.h[w11, 7, vgx4], {  z31.b, z0.b, z1.b, z2.b  }, z15.b
+// CHECK-ENCODING: [0xef,0x73,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c13f73ef <unknown>
+
+fdot    za.s[w8, 0, vgx4], {z0.b-z3.b}, z0.b  // 11000001-00110000-00010000-00011000
+// CHECK-INST: fdot    za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b
+// CHECK-ENCODING: [0x18,0x10,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1301018 <unknown>
+
+fdot    za.s[w8, 0], {z0.b-z3.b}, z0.b  // 11000001-00110000-00010000-00011000
+// CHECK-INST: fdot    za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b
+// CHECK-ENCODING: [0x18,0x10,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1301018 <unknown>
+
+fdot    za.s[w11, 7, vgx4], {z31.b-z2.b}, z15.b  // 11000001-00111111-01110011-11111111
+// CHECK-INST: fdot    za.s[w11, 7, vgx4], {  z31.b, z0.b, z1.b, z2.b  }, z15.b
+// CHECK-ENCODING: [0xff,0x73,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c13f73ff <unknown>
+
+fdot    za.s[w11, 7], {z31.b-z2.b}, z15.b  // 11000001-00111111-01110011-11111111
+// CHECK-INST: fdot    za.s[w11, 7, vgx4], {  z31.b, z0.b, z1.b, z2.b  }, z15.b
+// CHECK-ENCODING: [0xff,0x73,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c13f73ff <unknown>
+
+fdot    za.h[w8, 0, vgx4], {z0.b-z3.b}, {z0.b-z3.b}  // 11000001-10100001-00010000-00100000
+// CHECK-INST: fdot    za.h[w8, 0, vgx4], { z0.b - z3.b }, { z0.b - z3.b }
+// CHECK-ENCODING: [0x20,0x10,0xa1,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1a11020 <unknown>
+
+fdot    za.h[w8, 0], {z0.b-z3.b}, {z0.b-z3.b}  // 11000001-10100001-00010000-00100000
+// CHECK-INST: fdot    za.h[w8, 0, vgx4], { z0.b - z3.b }, { z0.b - z3.b }
+// CHECK-ENCODING: [0x20,0x10,0xa1,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1a11020 <unknown>
+
+fdot    za.h[w11, 7, vgx4], {z28.b-z31.b}, {z28.b-z31.b}  // 11000001-10111101-01110011-10100111
+// CHECK-INST: fdot    za.h[w11, 7, vgx4], { z28.b - z31.b }, { z28.b - z31.b }
+// CHECK-ENCODING: [0xa7,0x73,0xbd,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1bd73a7 <unknown>
+
+fdot    za.h[w11, 7], {z28.b-z31.b}, {z28.b-z31.b}  // 11000001-10111101-01110011-10100111
+// CHECK-INST: fdot    za.h[w11, 7, vgx4], { z28.b - z31.b }, { z28.b - z31.b }
+// CHECK-ENCODING: [0xa7,0x73,0xbd,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1bd73a7 <unknown>
+
+fdot    za.h[w8, 0, vgx4], {z0.b-z3.b}, z0.b[0]  // 11000001-00010000-10010000-01000000
+// CHECK-INST: fdot    za.h[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+// CHECK-ENCODING: [0x40,0x90,0x10,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1109040 <unknown>
+
+fdot    za.s[w8, 0, vgx4], {z0.b-z3.b}, {z0.b-z3.b}  // 11000001-10100001-00010000-00110000
+// CHECK-INST: fdot    za.s[w8, 0, vgx4], { z0.b - z3.b }, { z0.b - z3.b }
+// CHECK-ENCODING: [0x30,0x10,0xa1,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1a11030 <unknown>
+
+fdot    za.s[w8, 0], {z0.b-z3.b}, {z0.b-z3.b}  // 11000001-10100001-00010000-00110000
+// CHECK-INST: fdot    za.s[w8, 0, vgx4], { z0.b - z3.b }, { z0.b - z3.b }
+// CHECK-ENCODING: [0x30,0x10,0xa1,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1a11030 <unknown>
+
+fdot    za.s[w11, 7, vgx4], {z28.b-z31.b}, {z28.b-z31.b}  // 11000001-10111101-01110011-10110111
+// CHECK-INST: fdot    za.s[w11, 7, vgx4], { z28.b - z31.b }, { z28.b - z31.b }
+// CHECK-ENCODING: [0xb7,0x73,0xbd,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1bd73b7 <unknown>
+
+fdot    za.s[w11, 7], {z28.b-z31.b}, {z28.b-z31.b}  // 11000001-10111101-01110011-10110111
+// CHECK-INST: fdot    za.s[w11, 7, vgx4], { z28.b - z31.b }, { z28.b - z31.b }
+// CHECK-ENCODING: [0xb7,0x73,0xbd,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1bd73b7 <unknown>
+
+fdot    za.h[w8, 0], {z0.b-z3.b}, z0.b[0]  // 11000001-00010000-10010000-01000000
+// CHECK-INST: fdot    za.h[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+// CHECK-ENCODING: [0x40,0x90,0x10,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1109040 <unknown>
+
+fdot    za.h[w11, 7, vgx4], {z28.b-z31.b}, z15.b[7]  // 11000001-00011111-11111111-11001111
+// CHECK-INST: fdot    za.h[w11, 7, vgx4], { z28.b - z31.b }, z15.b[7]
+// CHECK-ENCODING: [0xcf,0xff,0x1f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c11fffcf <unknown>
+
+fdot    za.h[w11, 7], {z28.b-z31.b}, z15.b[7]  // 11000001-00011111-11111111-11001111
+// CHECK-INST: fdot    za.h[w11, 7, vgx4], { z28.b - z31.b }, z15.b[7]
+// CHECK-ENCODING: [0xcf,0xff,0x1f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c11fffcf <unknown>
+
+fdot    za.s[w8, 0, vgx4], {z0.b-z3.b}, z0.b[0]  // 11000001-01010000-10000000-00001000
+// CHECK-INST: fdot    za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+// CHECK-ENCODING: [0x08,0x80,0x50,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1508008 <unknown>
+
+fdot    za.s[w8, 0], {z0.b-z3.b}, z0.b[0]  // 11000001-01010000-10000000-00001000
+// CHECK-INST: fdot    za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+// CHECK-ENCODING: [0x08,0x80,0x50,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1508008 <unknown>
+
+fdot    za.s[w11, 7, vgx4], {z28.b-z31.b}, z15.b[3]  // 11000001-01011111-11101111-10001111
+// CHECK-INST: fdot    za.s[w11, 7, vgx4], { z28.b - z31.b }, z15.b[3]
+// CHECK-ENCODING: [0x8f,0xef,0x5f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c15fef8f <unknown>
+
+fdot    za.s[w11, 7], {z28.b-z31.b}, z15.b[3]  // 11000001-01011111-11101111-10001111
+// CHECK-INST: fdot    za.s[w11, 7, vgx4], { z28.b - z31.b }, z15.b[3]
+// CHECK-ENCODING: [0x8f,0xef,0x5f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c15fef8f <unknown>
+
+
+// FVDOT
+
+fvdot   za.h[w8, 0, vgx2], {z0.b-z1.b}, z0.b[0]  // 11000001-11010000-00010000-00100000
+// CHECK-INST: fvdot   za.h[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x20,0x10,0xd0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1d01020 <unknown>
+
+fvdot   za.h[w8, 0], {z0.b-z1.b}, z0.b[0]  // 11000001-11010000-00010000-00100000
+// CHECK-INST: fvdot   za.h[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x20,0x10,0xd0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1d01020 <unknown>
+
+fvdot   za.h[w11, 7, vgx2], {z30.b-z31.b}, z15.b[7]  // 11000001-11011111-01111111-11101111
+// CHECK-INST: fvdot   za.h[w11, 7, vgx2], { z30.b, z31.b }, z15.b[7]
+// CHECK-ENCODING: [0xef,0x7f,0xdf,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1df7fef <unknown>
+
+fvdot   za.h[w11, 7], {z30.b-z31.b}, z15.b[7]  // 11000001-11011111-01111111-11101111
+// CHECK-INST: fvdot   za.h[w11, 7, vgx2], { z30.b, z31.b }, z15.b[7]
+// CHECK-ENCODING: [0xef,0x7f,0xdf,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1df7fef <unknown>
+
+// FVDOTB
+
+fvdotb  za.s[w8, 0, vgx4], {z0.b-z1.b}, z0.b[0]  // 11000001-11010000-00001000-00000000
+// CHECK-INST: fvdotb  za.s[w8, 0, vgx4], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x00,0x08,0xd0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1d00800 <unknown>
+
+fvdotb  za.s[w11, 7, vgx4], {z30.b-z31.b}, z15.b[3]  // 11000001-11011111-01101111-11001111
+// CHECK-INST: fvdotb  za.s[w11, 7, vgx4], { z30.b, z31.b }, z15.b[3]
+// CHECK-ENCODING: [0xcf,0x6f,0xdf,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1df6fcf <unknown>
+
+// FVDOTT
+fvdott  za.s[w8, 0, vgx4], {z0.b-z1.b}, z0.b[0]  // 11000001-11010000-00001000-00010000
+// CHECK-INST: fvdott  za.s[w8, 0, vgx4], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x10,0x08,0xd0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1d00810 <unknown>
+
+fvdott  za.s[w11, 7, vgx4], {z30.b-z31.b}, z15.b[3]  // 11000001-11011111-01101111-11011111
+// CHECK-INST: fvdott  za.s[w11, 7, vgx4], { z30.b, z31.b }, z15.b[3]
+// CHECK-ENCODING: [0xdf,0x6f,0xdf,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1df6fdf <unknown>
diff --git a/llvm/test/MC/AArch64/FP8_SME2/mla-diagnostics.s b/llvm/test/MC/AArch64/FP8_SME2/mla-diagnostics.s
new file mode 100644
index 000000000000000..dc3d2d1ff3f0023
--- /dev/null
+++ b/llvm/test/MC/AArch64/FP8_SME2/mla-diagnostics.s
@@ -0,0 +1,195 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32  2>&1 < %s | FileCheck %s
+
+// --------------------------------------------------------------------------//
+// Invalid vector select register
+
+fmlal    za.h[w8, 0:1, vgx2], {z0.h-z1.h}, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmlal    za.h[w8, 0:1, vgx2], {z0.h-z1.h}, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za.h[w11, 4:7], {z31.b-z2.b}, z15
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
+// CHECK-NEXT: fmlal    za.h[w11, 4:7], {z31.b-z2.b}, z15
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za.h[w11, 6:7, vgx2], {z28.b-z31.b}, {z0.b-z3.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmlal    za.h[w11, 6:7, vgx2], {z28.b-z31.b}, {z0.b-z3.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za.s[w11, 0:3], {z29.b-z30.b}, {z30.b-z31.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmlall    za.s[w11, 0:3], {z29.b-z30.b}, {z30.b-z31.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za.s[w11, 4:7], {z30.b-z0.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmlall    za.s[w11, 4:7], {z30.b-z0.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+
+// --------------------------------------------------------------------------//
+// Invalid vector select offset
+
+fmlal   za.h[w11, 1:2], {z30.b-z31.b}, z15.b[7]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector select offset must be an immediate range of the form <immf>:<imml>, where the first immediate is a multiple of 2 in the range [0, 6] or [0, 14] depending on the instruction, and the second immediate is immf + 1.
+// CHECK-NEXT: fmlal   za.h[w11, 1:2], {z30.b-z31.b}, z15.b[7]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za.h[w11, 3:4], {z28.b-z31.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector select offset must be an immediate range of the form <immf>:<imml>, where the first immediate is a multiple of 2 in the range [0, 6] or [0, 14] depending on the instruction, and the second immediate is immf + 1.
+// CHECK-NEXT: fmlal    za.h[w11, 3:4], {z28.b-z31.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za.h[w11, 7:8, vgx4], {z28.b-z31.b}, {z4.b-z7.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector select offset must be an immediate range of the form <immf>:<imml>, where the first immediate is a multiple of 2 in the range [0, 6] or [0, 14] depending on the instruction, and the second immediate is immf + 1.
+// CHECK-NEXT: fmlal    za.h[w11, 7:8, vgx4], {z28.b-z31.b}, {z4.b-z7.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall  za.s[w11, 3:6, vgx4], {z30.b-z31.b}, z15.b[3]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector select offset must be an immediate range of the form <immf>:<imml>, where the first immediate is a multiple of 4 in the range [0, 4] or [0, 12] depending on the instruction, and the second immediate is immf + 3.
+// CHECK-NEXT: fmlall  za.s[w11, 3:6, vgx4], {z30.b-z31.b}, z15.b[3]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall  za.s[w8, 3:6, vgx4], {z0.b-z3.b}, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector select offset must be an immediate range of the form <immf>:<imml>, where the first immediate is a multiple of 4 in the range [0, 4] or [0, 12] depending on the instruction, and the second immediate is immf + 3.
+// CHECK-NEXT: fmlall  za.s[w8, 3:6, vgx4], {z0.b-z3.b}, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall  za.s[w11, 7:10, vgx4], {z30.b-z31.b}, {z12.b-z13.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector select offset must be an immediate range of the form <immf>:<imml>, where the first immediate is a multiple of 4 in the range [0, 4] or [0, 12] depending on the instruction, and the second immediate is immf + 3.
+// CHECK-NEXT: fmlall  za.s[w11, 7:10, vgx4], {z30.b-z31.b}, {z12.b-z13.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
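+
+// For reference (an illustration derived from the diagnostics above, not a
+// FileCheck directive): the multi-vector fmlal forms accept offset ranges
+// 0:1, 2:3, 4:5 and 6:7, with the single-vector form extending up to 14:15;
+// fmlall accepts 0:3 and 4:7, extending up to 8:11 and 12:15 for the
+// single-vector form.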
+
+// --------------------------------------------------------------------------//
+// Invalid vector list
+
+fmlal    za.h[w11, 4:7, vgx4], {z29.b-z1.b}, {z29.b-z1.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: fmlal    za.h[w11, 4:7, vgx4], {z29.b-z1.b}, {z29.b-z1.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za.h[w11, 4:7], {z30.b-z2.b}, {z0.b-z3.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: fmlal    za.h[w11, 4:7], {z30.b-z2.b}, {z0.b-z3.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za.s[w8, 0:1], {z31.b-z3.b}, {z31.b-z3.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: fmlall    za.s[w8, 0:1], {z31.b-z3.b}, {z31.b-z3.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za.s[w11, 6:7, vgx2], {z30.b-z31.b}, {z0.b-z4.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid number of vectors
+// CHECK-NEXT: fmlall    za.s[w11, 6:7, vgx2], {z30.b-z31.b}, {z0.b-z4.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid Register Suffix
+fmlal    za.d[w11, 4:5, vgx4], {z31.b-z2.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fmlal    za.d[w11, 4:5, vgx4], {z31.b-z2.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za[w11, 2:3], {z28.b-z31.b}, {z28.b-z31.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fmlal    za[w11, 2:3], {z28.b-z31.b}, {z28.b-z31.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za.b[w11, 6:7], {z31.b-z0.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fmlal    za.b[w11, 6:7], {z31.b-z0.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za.b[w11, 6:7, vgx2], {z30.h-z31.h}, {z30.h-z31.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fmlall    za.b[w11, 6:7, vgx2], {z30.h-z31.h}, {z30.h-z31.h}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za[w11, 4:7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fmlall    za[w11, 4:7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za.d[w11, 12:15], {z28.b-z31.b}, {z28.b-z31.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
+// CHECK-NEXT: fmlall    za.d[w11, 12:15], {z28.b-z31.b}, {z28.b-z31.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid vector select register
+
+fmlal    za.h[w7, 4:7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [w8, w11]
+// CHECK-NEXT: fmlal    za.h[w7, 4:7, vgx4], {z31.b-z2.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za.h[w, 0:1, vgx2], {z0.b-z1.b}, z0.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [w8, w11]
+// CHECK-NEXT: fmlal    za.h[w, 0:1, vgx2], {z0.b-z1.b}, z0.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za.s[w12, 0:3], {z0.b-z3.b}, {z0.b-z3.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [w8, w11]
+// CHECK-NEXT: fmlall    za.s[w12, 0:3], {z0.b-z3.b}, {z0.b-z3.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid indexed-vector or single-vector register
+
+fmlal za.h[w8, 0:1], {z0.b-z1.b}, z16.b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z15.b
+// CHECK-NEXT: fmlal za.h[w8, 0:1], {z0.b-z1.b}, z16.b[0]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal   za.h[w9, 14:15], z31.b, z16.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z15.b
+// CHECK-NEXT: fmlal   za.h[w9, 14:15], z31.b, z16.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall  za.s[w11, 8:11], z9.b, z16.b[13]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z15.b
+// CHECK-NEXT: fmlall  za.s[w11, 8:11], z9.b, z16.b[13]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall  za.s[w11, 12:15], z31.b, z16.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z15.b
+// CHECK-NEXT: fmlall  za.s[w11, 12:15], z31.b, z16.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid vector grouping
+
+fmlal    za.h[w11, 10:11], {z28.b-z31.b}, {z0.b-z2.b}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmlal    za.h[w11, 10:11], {z28.b-z31.b}, {z0.b-z2.b}
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall    za.s[w11, 4:7, vgx4], {z31.b-z0.b}, z15.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmlall    za.s[w11, 4:7, vgx4], {z31.b-z0.b}, z15.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid lane index
+
+fmlal   za.h[w11, 14:15], z31.b, z15.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 15].
+// CHECK-NEXT: fmlal   za.h[w11, 14:15], z31.b, z15.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlal    za.h[w11, 2:3], {z30.b-z31.b}, z15.b[16]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 15].
+// CHECK-NEXT: fmlal    za.h[w11, 2:3], {z30.b-z31.b}, z15.b[16]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall  za.s[w9, 12:15], z12.b, z11.b[16]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 15].
+// CHECK-NEXT: fmlall  za.s[w9, 12:15], z12.b, z11.b[16]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmlall  za.s[w8, 4:7], {z16.b-z19.b}, z0.b[-1]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 15].
+// CHECK-NEXT: fmlall  za.s[w8, 4:7], {z16.b-z19.b}, z0.b[-1]
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/FP8_SME2/mla.s b/llvm/test/MC/AArch64/FP8_SME2/mla.s
new file mode 100644
index 000000000000000..a9a54860dc6c99a
--- /dev/null
+++ b/llvm/test/MC/AArch64/FP8_SME2/mla.s
@@ -0,0 +1,361 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f8f16,+sme-f8f32 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sme-f8f16,-sme-f8f32 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f8f16,+sme-f8f32 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+
+fmlal   za.h[w8, 0:1], z0.b, z0.b  // 11000001-00110000-00001100-00000000
+// CHECK-INST: fmlal   za.h[w8, 0:1], z0.b, z0.b
+// CHECK-ENCODING: [0x00,0x0c,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1300c00 <unknown>
+
+fmlal   za.h[w11, 14:15], z31.b, z15.b  // 11000001-00111111-01101111-11100111
+// CHECK-INST: fmlal   za.h[w11, 14:15], z31.b, z15.b
+// CHECK-ENCODING: [0xe7,0x6f,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c13f6fe7 <unknown>
+
+fmlal   za.h[w8, 0:1], z0.b, z0.b[0]  // 11000001-11000000-00000000-00000000
+// CHECK-INST: fmlal   za.h[w8, 0:1], z0.b, z0.b[0]
+// CHECK-ENCODING: [0x00,0x00,0xc0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1c00000 <unknown>
+
+fmlal   za.h[w11, 14:15], z31.b, z15.b[15]  // 11000001-11001111-11101111-11101111
+// CHECK-INST: fmlal   za.h[w11, 14:15], z31.b, z15.b[15]
+// CHECK-ENCODING: [0xef,0xef,0xcf,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1cfefef <unknown>
+
+// x2
+
+fmlal   za.h[w8, 0:1, vgx2], {z0.b-z1.b}, z0.b  // 11000001-00100000-00001000-00000100
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx2], { z0.b, z1.b }, z0.b
+// CHECK-ENCODING: [0x04,0x08,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1200804 <unknown>
+
+fmlal   za.h[w8, 0:1], {z0.b-z1.b}, z0.b  // 11000001-00100000-00001000-00000100
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx2], { z0.b, z1.b }, z0.b
+// CHECK-ENCODING: [0x04,0x08,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1200804 <unknown>
+
+fmlal   za.h[w11, 6:7, vgx2], {z31.b-z0.b}, z15.b  // 11000001-00101111-01101011-11100111
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx2], { z31.b, z0.b }, z15.b
+// CHECK-ENCODING: [0xe7,0x6b,0x2f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c12f6be7 <unknown>
+
+fmlal   za.h[w11, 6:7], {z31.b-z0.b}, z15.b  // 11000001-00101111-01101011-11100111
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx2], { z31.b, z0.b }, z15.b
+// CHECK-ENCODING: [0xe7,0x6b,0x2f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c12f6be7 <unknown>
+
+fmlal   za.h[w8, 0:1, vgx2], {z0.b-z1.b}, {z0.b-z1.b}  // 11000001-10100000-00001000-00100000
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx2], { z0.b, z1.b }, { z0.b, z1.b }
+// CHECK-ENCODING: [0x20,0x08,0xa0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1a00820 <unknown>
+
+fmlal   za.h[w8, 0:1], {z0.b-z1.b}, {z0.b-z1.b}  // 11000001-10100000-00001000-00100000
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx2], { z0.b, z1.b }, { z0.b, z1.b }
+// CHECK-ENCODING: [0x20,0x08,0xa0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1a00820 <unknown>
+
+fmlal   za.h[w11, 6:7, vgx2], {z30.b-z31.b}, {z30.b-z31.b}  // 11000001-10111110-01101011-11100011
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx2], { z30.b, z31.b }, { z30.b, z31.b }
+// CHECK-ENCODING: [0xe3,0x6b,0xbe,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1be6be3 <unknown>
+
+fmlal   za.h[w11, 6:7], {z30.b-z31.b}, {z30.b-z31.b}  // 11000001-10111110-01101011-11100011
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx2], { z30.b, z31.b }, { z30.b, z31.b }
+// CHECK-ENCODING: [0xe3,0x6b,0xbe,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1be6be3 <unknown>
+
+fmlal   za.h[w8, 0:1, vgx2], {z0.b-z1.b}, z0.b[0]  // 11000001-10010000-00010000-00110000
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x30,0x10,0x90,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1901030 <unknown>
+
+fmlal   za.h[w8, 0:1], {z0.b-z1.b}, z0.b[0]  // 11000001-10010000-00010000-00110000
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x30,0x10,0x90,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1901030 <unknown>
+
+fmlal   za.h[w11, 6:7, vgx2], {z30.b-z31.b}, z15.b[15]  // 11000001-10011111-01111111-11111111
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx2], { z30.b, z31.b }, z15.b[15]
+// CHECK-ENCODING: [0xff,0x7f,0x9f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c19f7fff <unknown>
+
+fmlal   za.h[w11, 6:7], {z30.b-z31.b}, z15.b[15]  // 11000001-10011111-01111111-11111111
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx2], { z30.b, z31.b }, z15.b[15]
+// CHECK-ENCODING: [0xff,0x7f,0x9f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c19f7fff <unknown>
+
+// x4
+
+fmlal   za.h[w8, 0:1, vgx4], {z0.b-z3.b}, z0.b  // 11000001-00110000-00001000-00000100
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx4], { z0.b - z3.b }, z0.b
+// CHECK-ENCODING: [0x04,0x08,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1300804 <unknown>
+
+fmlal   za.h[w8, 0:1], {z0.b-z3.b}, z0.b  // 11000001-00110000-00001000-00000100
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx4], { z0.b - z3.b }, z0.b
+// CHECK-ENCODING: [0x04,0x08,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1300804 <unknown>
+
+fmlal   za.h[w11, 6:7, vgx4], {z31.b-z2.b}, z15.b  // 11000001-00111111-01101011-11100111
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx4], {  z31.b, z0.b, z1.b, z2.b  }, z15.b
+// CHECK-ENCODING: [0xe7,0x6b,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c13f6be7 <unknown>
+
+fmlal   za.h[w11, 6:7], {z31.b-z2.b}, z15.b  // 11000001-00111111-01101011-11100111
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx4], {  z31.b, z0.b, z1.b, z2.b  }, z15.b
+// CHECK-ENCODING: [0xe7,0x6b,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c13f6be7 <unknown>
+
+fmlal   za.h[w8, 0:1, vgx4], {z0.b-z3.b}, {z0.b-z3.b}  // 11000001-10100001-00001000-00100000
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx4], { z0.b - z3.b }, { z0.b - z3.b }
+// CHECK-ENCODING: [0x20,0x08,0xa1,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1a10820 <unknown>
+
+fmlal   za.h[w8, 0:1], {z0.b-z3.b}, {z0.b-z3.b}  // 11000001-10100001-00001000-00100000
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx4], { z0.b - z3.b }, { z0.b - z3.b }
+// CHECK-ENCODING: [0x20,0x08,0xa1,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1a10820 <unknown>
+
+fmlal   za.h[w11, 6:7, vgx4], {z28.b-z31.b}, {z28.b-z31.b}  // 11000001-10111101-01101011-10100011
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx4], { z28.b - z31.b }, { z28.b - z31.b }
+// CHECK-ENCODING: [0xa3,0x6b,0xbd,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1bd6ba3 <unknown>
+
+fmlal   za.h[w11, 6:7], {z28.b-z31.b}, {z28.b-z31.b}  // 11000001-10111101-01101011-10100011
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx4], { z28.b - z31.b }, { z28.b - z31.b }
+// CHECK-ENCODING: [0xa3,0x6b,0xbd,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1bd6ba3 <unknown>
+
+fmlal   za.h[w8, 0:1, vgx4], {z0.b-z3.b}, z0.b[0]  // 11000001-10010000-10010000-00100000
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx4], { z0.b - z3.b }, z0.b[0]
+// CHECK-ENCODING: [0x20,0x90,0x90,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1909020 <unknown>
+
+fmlal   za.h[w8, 0:1], {z0.b-z3.b}, z0.b[0]  // 11000001-10010000-10010000-00100000
+// CHECK-INST: fmlal   za.h[w8, 0:1, vgx4], { z0.b - z3.b }, z0.b[0]
+// CHECK-ENCODING: [0x20,0x90,0x90,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c1909020 <unknown>
+
+fmlal   za.h[w11, 6:7, vgx4], {z28.b-z31.b}, z15.b[15]  // 11000001-10011111-11111111-10101111
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx4], { z28.b - z31.b }, z15.b[15]
+// CHECK-ENCODING: [0xaf,0xff,0x9f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c19fffaf <unknown>
+
+fmlal   za.h[w11, 6:7], {z28.b-z31.b}, z15.b[15]  // 11000001-10011111-11111111-10101111
+// CHECK-INST: fmlal   za.h[w11, 6:7, vgx4], { z28.b - z31.b }, z15.b[15]
+// CHECK-ENCODING: [0xaf,0xff,0x9f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: c19fffaf <unknown>
+
+
+// FMLALL
+
+fmlall  za.s[w8, 0:3], z0.b, z0.b  // 11000001-00110000-00000100-00000000
+// CHECK-INST: fmlall  za.s[w8, 0:3], z0.b, z0.b
+// CHECK-ENCODING: [0x00,0x04,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1300400 <unknown>
+
+fmlall  za.s[w11, 12:15], z31.b, z15.b  // 11000001-00111111-01100111-11100011
+// CHECK-INST: fmlall  za.s[w11, 12:15], z31.b, z15.b
+// CHECK-ENCODING: [0xe3,0x67,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c13f67e3 <unknown>
+
+fmlall  za.s[w8, 0:3], z0.b, z0.b[0]  // 11000001-01000000-00000000-00000000
+// CHECK-INST: fmlall  za.s[w8, 0:3], z0.b, z0.b[0]
+// CHECK-ENCODING: [0x00,0x00,0x40,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1400000 <unknown>
+
+fmlall  za.s[w11, 12:15], z31.b, z15.b[15]  // 11000001-01001111-11111111-11100011
+// CHECK-INST: fmlall  za.s[w11, 12:15], z31.b, z15.b[15]
+// CHECK-ENCODING: [0xe3,0xff,0x4f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c14fffe3 <unknown>
+
+// x2
+
+fmlall  za.s[w8, 0:3, vgx2], {z0.b-z1.b}, z0.b  // 11000001-00100000-00000000-00000010
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx2], { z0.b, z1.b }, z0.b
+// CHECK-ENCODING: [0x02,0x00,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1200002 <unknown>
+
+fmlall  za.s[w8, 0:3], {z0.b-z1.b}, z0.b  // 11000001-00100000-00000000-00000010
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx2], { z0.b, z1.b }, z0.b
+// CHECK-ENCODING: [0x02,0x00,0x20,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1200002 <unknown>
+
+fmlall  za.s[w11, 4:7, vgx2], {z31.b-z0.b}, z15.b  // 11000001-00101111-01100011-11100011
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx2], { z31.b, z0.b }, z15.b
+// CHECK-ENCODING: [0xe3,0x63,0x2f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c12f63e3 <unknown>
+
+fmlall  za.s[w11, 4:7], {z31.b-z0.b}, z15.b  // 11000001-00101111-01100011-11100011
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx2], { z31.b, z0.b }, z15.b
+// CHECK-ENCODING: [0xe3,0x63,0x2f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c12f63e3 <unknown>
+
+fmlall  za.s[w8, 0:3, vgx2], {z0.b-z1.b}, {z0.b-z1.b}  // 11000001-10100000-00000000-00100000
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx2], { z0.b, z1.b }, { z0.b, z1.b }
+// CHECK-ENCODING: [0x20,0x00,0xa0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1a00020 <unknown>
+
+fmlall  za.s[w8, 0:3], {z0.b-z1.b}, {z0.b-z1.b}  // 11000001-10100000-00000000-00100000
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx2], { z0.b, z1.b }, { z0.b, z1.b }
+// CHECK-ENCODING: [0x20,0x00,0xa0,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1a00020 <unknown>
+
+fmlall  za.s[w11, 4:7, vgx2], {z30.b-z31.b}, {z30.b-z31.b}  // 11000001-10111110-01100011-11100001
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx2], { z30.b, z31.b }, { z30.b, z31.b }
+// CHECK-ENCODING: [0xe1,0x63,0xbe,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1be63e1 <unknown>
+
+fmlall  za.s[w11, 4:7], {z30.b-z31.b}, {z30.b-z31.b}  // 11000001-10111110-01100011-11100001
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx2], { z30.b, z31.b }, { z30.b, z31.b }
+// CHECK-ENCODING: [0xe1,0x63,0xbe,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1be63e1 <unknown>
+
+fmlall  za.s[w8, 0:3, vgx2], {z0.b-z1.b}, z0.b[0]  // 11000001-10010000-00000000-00100000
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x20,0x00,0x90,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1900020 <unknown>
+
+fmlall  za.s[w8, 0:3], {z0.b-z1.b}, z0.b[0]  // 11000001-10010000-00000000-00100000
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx2], { z0.b, z1.b }, z0.b[0]
+// CHECK-ENCODING: [0x20,0x00,0x90,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1900020 <unknown>
+
+fmlall  za.s[w11, 4:7, vgx2], {z30.b-z31.b}, z15.b[15]  // 11000001-10011111-01101111-11100111
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx2], { z30.b, z31.b }, z15.b[15]
+// CHECK-ENCODING: [0xe7,0x6f,0x9f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c19f6fe7 <unknown>
+
+fmlall  za.s[w11, 4:7], {z30.b-z31.b}, z15.b[15]  // 11000001-10011111-01101111-11100111
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx2], { z30.b, z31.b }, z15.b[15]
+// CHECK-ENCODING: [0xe7,0x6f,0x9f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c19f6fe7 <unknown>
+
+// x4
+
+fmlall  za.s[w8, 0:3, vgx4], {z0.b-z3.b}, z0.b  // 11000001-00110000-00000000-00000010
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx4], { z0.b - z3.b }, z0.b
+// CHECK-ENCODING: [0x02,0x00,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1300002 <unknown>
+
+fmlall  za.s[w8, 0:3], {z0.b-z3.b}, z0.b  // 11000001-00110000-00000000-00000010
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx4], { z0.b - z3.b }, z0.b
+// CHECK-ENCODING: [0x02,0x00,0x30,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1300002 <unknown>
+
+fmlall  za.s[w11, 4:7, vgx4], {z31.b-z2.b}, z15.b  // 11000001-00111111-01100011-11100011
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx4], {  z31.b, z0.b, z1.b, z2.b  }, z15.b
+// CHECK-ENCODING: [0xe3,0x63,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c13f63e3 <unknown>
+
+fmlall  za.s[w11, 4:7], {z31.b-z2.b}, z15.b  // 11000001-00111111-01100011-11100011
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx4], {  z31.b, z0.b, z1.b, z2.b  }, z15.b
+// CHECK-ENCODING: [0xe3,0x63,0x3f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c13f63e3 <unknown>
+
+fmlall  za.s[w8, 0:3, vgx4], {z0.b-z3.b}, {z0.b-z3.b}  // 11000001-10100001-00000000-00100000
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx4], { z0.b - z3.b }, { z0.b - z3.b }
+// CHECK-ENCODING: [0x20,0x00,0xa1,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1a10020 <unknown>
+
+fmlall  za.s[w8, 0:3], {z0.b-z3.b}, {z0.b-z3.b}  // 11000001-10100001-00000000-00100000
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx4], { z0.b - z3.b }, { z0.b - z3.b }
+// CHECK-ENCODING: [0x20,0x00,0xa1,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1a10020 <unknown>
+
+fmlall  za.s[w11, 4:7, vgx4], {z28.b-z31.b}, {z28.b-z31.b}  // 11000001-10111101-01100011-10100001
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx4], { z28.b - z31.b }, { z28.b - z31.b }
+// CHECK-ENCODING: [0xa1,0x63,0xbd,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1bd63a1 <unknown>
+
+fmlall  za.s[w11, 4:7], {z28.b-z31.b}, {z28.b-z31.b}  // 11000001-10111101-01100011-10100001
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx4], { z28.b - z31.b }, { z28.b - z31.b }
+// CHECK-ENCODING: [0xa1,0x63,0xbd,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1bd63a1 <unknown>
+
+fmlall  za.s[w8, 0:3, vgx4], {z0.b-z3.b}, z0.b[0]  // 11000001-00010000-10000000-01000000
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx4], { z0.b - z3.b }, z0.b[0]
+// CHECK-ENCODING: [0x40,0x80,0x10,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1108040 <unknown>
+
+fmlall  za.s[w8, 0:3], {z0.b-z3.b}, z0.b[0]  // 11000001-00010000-10000000-01000000
+// CHECK-INST: fmlall  za.s[w8, 0:3, vgx4], { z0.b - z3.b }, z0.b[0]
+// CHECK-ENCODING: [0x40,0x80,0x10,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c1108040 <unknown>
+
+fmlall  za.s[w11, 4:7, vgx4], {z28.b-z31.b}, z15.b[15]  // 11000001-00011111-11101111-11000111
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx4], { z28.b - z31.b }, z15.b[15]
+// CHECK-ENCODING: [0xc7,0xef,0x1f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c11fefc7 <unknown>
+
+fmlall  za.s[w11, 4:7], {z28.b-z31.b}, z15.b[15]  // 11000001-00011111-11101111-11000111
+// CHECK-INST: fmlall  za.s[w11, 4:7, vgx4], { z28.b - z31.b }, z15.b[15]
+// CHECK-ENCODING: [0xc7,0xef,0x1f,0xc1]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: c11fefc7 <unknown>
diff --git a/llvm/test/MC/AArch64/FP8_SME2/mopa-diagnostics.s b/llvm/test/MC/AArch64/FP8_SME2/mopa-diagnostics.s
new file mode 100644
index 000000000000000..1c788f6d04c4094
--- /dev/null
+++ b/llvm/test/MC/AArch64/FP8_SME2/mopa-diagnostics.s
@@ -0,0 +1,46 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32 2>&1 < %s | FileCheck %s
+
+
+// --------------------------------------------------------------------------//
+// Invalid predicate register
+
+fmopa   za0.h, p8/m, p0/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: fmopa   za0.h, p8/m, p0/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmopa   za0.h, p0/m, p8/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: fmopa   za0.h, p0/m, p8/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmopa   za0.s, p8/m, p0/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: fmopa   za0.s, p8/m, p0/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmopa   za3.s, p7/m, p8/m, z31.b, z31.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix)
+// CHECK-NEXT: fmopa   za3.s, p7/m, p8/m, z31.b, z31.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid matrix operand
+
+fmopa   za2.h, p0/m, p0/m, z0.b, z0.b
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK-NEXT: fmopa   za2.h, p0/m, p0/m, z0.b, z0.b
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+// --------------------------------------------------------------------------//
+// Invalid register suffixes
+
+fmopa   za0.h, p0/m, p0/m, z0.b, z0.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: fmopa   za0.h, p0/m, p0/m, z0.b, z0.h
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
+
+fmopa   za3.s, p7/m, p0/m, z31.b, z31.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK-NEXT: fmopa   za3.s, p7/m, p0/m, z31.b, z31.s
+// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/FP8_SME2/mopa.s b/llvm/test/MC/AArch64/FP8_SME2/mopa.s
new file mode 100644
index 000000000000000..23d923949c7cf0f
--- /dev/null
+++ b/llvm/test/MC/AArch64/FP8_SME2/mopa.s
@@ -0,0 +1,39 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme-f8f16,+sme-f8f32 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sme-f8f16,-sme-f8f32 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16,+sme-f8f32 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme-f8f16,+sme-f8f32 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+fmopa   za0.h, p0/m, p0/m, z0.b, z0.b  // 10000000-10100000-00000000-00001000
+// CHECK-INST: fmopa   za0.h, p0/m, p0/m, z0.b, z0.b
+// CHECK-ENCODING: [0x08,0x00,0xa0,0x80]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: 80a00008 <unknown>
+
+
+fmopa   za1.h, p7/m, p7/m, z31.b, z31.b  // 10000000-10111111-11111111-11101001
+// CHECK-INST: fmopa   za1.h, p7/m, p7/m, z31.b, z31.b
+// CHECK-ENCODING: [0xe9,0xff,0xbf,0x80]
+// CHECK-ERROR: instruction requires: sme-f8f16
+// CHECK-UNKNOWN: 80bfffe9 <unknown>
+
+
+fmopa   za0.s, p0/m, p0/m, z0.b, z0.b  // 10000000-10100000-00000000-00000000
+// CHECK-INST: fmopa   za0.s, p0/m, p0/m, z0.b, z0.b
+// CHECK-ENCODING: [0x00,0x00,0xa0,0x80]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: 80a00000 <unknown>
+
+fmopa   za3.s, p7/m, p7/m, z31.b, z31.b  // 10000000-10111111-11111111-11100011
+// CHECK-INST: fmopa   za3.s, p7/m, p7/m, z31.b, z31.b
+// CHECK-ENCODING: [0xe3,0xff,0xbf,0x80]
+// CHECK-ERROR: instruction requires: sme-f8f32
+// CHECK-UNKNOWN: 80bfffe3 <unknown>
diff --git a/llvm/test/MC/AArch64/SME2/fmlal-diagnostics.s b/llvm/test/MC/AArch64/SME2/fmlal-diagnostics.s
index 3fa960bcc4d29b9..ebc725606fb1fb3 100644
--- a/llvm/test/MC/AArch64/SME2/fmlal-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2/fmlal-diagnostics.s
@@ -55,9 +55,9 @@ fmlal za.s[w8, 9:10, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // --------------------------------------------------------------------------//
 // Invalid Register Suffix
 
-fmlal za.h[w8, 6:7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
+fmlal za.d[w8, 6:7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
-// CHECK-NEXT: fmlal za.h[w8, 6:7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
+// CHECK-NEXT: fmlal za.d[w8, 6:7, vgx2], {z12.h-z13.h}, {z8.h-z9.h}
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
 // --------------------------------------------------------------------------//
diff --git a/llvm/test/MC/AArch64/SME2/fvdot-diagnostics.s b/llvm/test/MC/AArch64/SME2/fvdot-diagnostics.s
index d991207ee1a739b..696798eaaa0786a 100644
--- a/llvm/test/MC/AArch64/SME2/fvdot-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2/fvdot-diagnostics.s
@@ -42,9 +42,9 @@ fvdot za.s[w8, 0, vgx2], {z1.h-z2.h}, z0.h[0]
 // --------------------------------------------------------------------------//
 // Invalid Matrix Operand
 
-fvdot za.h[w8, 0, vgx2], {z0.h-z2.h}, z0.h[0]
+fvdot za.b[w8, 0, vgx2], {z0.h-z2.h}, z0.h[0]
 // CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected suffix .s
-// CHECK-NEXT: fvdot za.h[w8, 0, vgx2], {z0.h-z2.h}, z0.h[0]
+// CHECK-NEXT: fvdot za.b[w8, 0, vgx2], {z0.h-z2.h}, z0.h[0]
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
 
 // --------------------------------------------------------------------------//
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 16ea62a621d2ae7..6b85ee0ff664b84 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1746,7 +1746,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
       AArch64::AEK_SSVE_FP8FMA,  AArch64::AEK_FP8DOT2,
       AArch64::AEK_SSVE_FP8DOT2, AArch64::AEK_FP8DOT4,
       AArch64::AEK_SSVE_FP8DOT4, AArch64::AEK_LUT,
-      AArch64::AEK_SME_LUTv2};
+      AArch64::AEK_SME_LUTv2,    AArch64::AEK_SMEF8F16,
+      AArch64::AEK_SMEF8F32};
 
   std::vector<StringRef> Features;
 
@@ -1829,6 +1830,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
   EXPECT_TRUE(llvm::is_contained(Features, "+ssve-fp8dot4"));
   EXPECT_TRUE(llvm::is_contained(Features, "+lut"));
   EXPECT_TRUE(llvm::is_contained(Features, "+sme-lutv2"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+sme-f8f16"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+sme-f8f32"));
 
   // Assuming we listed every extension above, this should produce the same
   // result. (note that AEK_NONE doesn't have a name so it won't be in the
@@ -1963,6 +1966,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) {
       {"ssve-fp8dot4", "nossve-fp8dot4", "+ssve-fp8dot4", "-ssve-fp8dot4"},
       {"lut", "nolut", "+lut", "-lut"},
       {"sme-lutv2", "nosme-lutv2", "+sme-lutv2", "-sme-lutv2"},
+      {"sme-f8f16", "nosme-f8f16", "+sme-f8f16", "-sme-f8f16"},
+      {"sme-f8f32", "nosme-f8f32", "+sme-f8f32", "-sme-f8f32"},
   };
 
   for (unsigned i = 0; i < std::size(ArchExt); i++) {

>From 05a0d9441630ff417d6e4185b526478967d1fe15 Mon Sep 17 00:00:00 2001
From: Ben Langmuir <blangmuir at apple.com>
Date: Fri, 3 Nov 2023 08:32:08 -0700
Subject: [PATCH 36/76] [orc][mach-o] Fix mixing objc and swift code in a
 single JITDylib (#69258)

The system linker merges __objc_imageinfo flag values to select a
compatible set of flags, using the minimum Swift version and only
erroring on incompatible ABIs. Match that behaviour in the ORC MachO
platform. One wrinkle is that the JIT can add new objects after the
dylib is already running code. In that case we only check for known
incompatible flags and ignore the Swift version: it is too late to
change the flags at that point, and the Swift version is unlikely to
change runtime behaviour in practice.
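
For illustration, here is a minimal standalone sketch of the flag
layout and merge rule described above. The helper names are made up
for this example (they are not the MachOPlatform API); the bit
positions and test values follow the ObjCImageInfoFlags struct and the
objc-imageinfo.S tests added in this patch.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Feature bits in the second word of __objc_imageinfo.
constexpr uint32_t SignedClassRO = 1u << 4;
constexpr uint32_t HasCategoryClassProperties = 1u << 6;

static uint16_t swiftABI(uint32_t F) { return (F >> 8) & 0xFF; }
static uint16_t swiftVer(uint32_t F) { return (F >> 16) & 0xFFFF; }

// Merge two flag words whose feature bits and ABI versions have already
// been checked for compatibility: take the minimum Swift version when
// both images carry one, and let a pure-ObjC image (version 0) adopt
// the Swift image's versions.
static uint32_t merge(uint32_t Old, uint32_t New) {
  uint16_t Ver = (swiftVer(Old) && swiftVer(New))
                     ? std::min(swiftVer(Old), swiftVer(New))
                     : std::max(swiftVer(Old), swiftVer(New));
  uint16_t ABI = swiftABI(New) ? swiftABI(New) : swiftABI(Old);
  uint32_t Bits = New & (SignedClassRO | HasCategoryClassProperties);
  return Bits | (uint32_t(ABI) << 8) | (uint32_t(Ver) << 16);
}

int main() {
  // Values from the objc-imageinfo.S tests: Swift 5.9 + 5.0 -> 5.0.
  assert(merge(0x5090740, 0x5000740) == 0x5000740);
  // Swift 5.9 + new-style ObjC keeps 5.9 (SWIFT_MIX2 in the tests).
  assert(merge(0x5090740, 0x0000040) == 0x5090740);
}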
---
 .../TestCases/Darwin/arm64/objc-imageinfo.S   | 111 ++++++++++++++++++
 .../TestCases/Darwin/x86-64/objc-imageinfo.S  | 111 ++++++++++++++++++
 .../llvm/ExecutionEngine/Orc/MachOPlatform.h  |   6 +
 .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 111 +++++++++++++++++-
 4 files changed, 335 insertions(+), 4 deletions(-)
 create mode 100644 compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S
 create mode 100644 compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S

diff --git a/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S b/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S
new file mode 100644
index 000000000000000..d58943f9681dadb
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S
@@ -0,0 +1,111 @@
+// Test merging of __objc_imageinfo flags and ensure we can run mixed objc and
+// swift code in a single jit dylib.
+
+// REQUIRES: system-darwin && asserts
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: (cd %t; %clang -c *.S)
+
+// Check individual versions are loadable.
+
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_old.o 2>&1 | FileCheck %s -check-prefix=OLD
+// OLD: MachOPlatform: Registered __objc_imageinfo for main
+// OLD-SAME: flags = 0x0000
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=NEW
+// NEW: MachOPlatform: Registered __objc_imageinfo for main
+// NEW-SAME: flags = 0x0040
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_4.o 2>&1 | FileCheck %s -check-prefix=SWIFT_4
+// SWIFT_4: MachOPlatform: Registered __objc_imageinfo for main
+// SWIFT_4-SAME: flags = 0x0640
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_5
+// SWIFT_5: MachOPlatform: Registered __objc_imageinfo for main
+// SWIFT_5-SAME: flags = 0x5000740
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o 2>&1 | FileCheck %s -check-prefix=SWIFT_59
+// SWIFT_59: MachOPlatform: Registered __objc_imageinfo for main
+// SWIFT_59-SAME: flags = 0x5090740
+
+// Check error conditions.
+
+// RUN: not %llvm_jitlink %t/main.o %t/objc_old.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=CATEGORY
+// CATEGORY: ObjC category class property support in {{.*}} does not match first registered flags
+
+// RUN: not %llvm_jitlink %t/main.o %t/swift_4.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_ABI
+// SWIFT_ABI: Swift ABI version in {{.*}} does not match first registered flags
+
+// Check merging.
+
+// Take the lowest swift version.
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX1
+// SWIFT_MIX1: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740
+
+// Add swift to objc.
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX2
+// SWIFT_MIX2: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5090740
+
+// Add multiple swift to objc.
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/swift_5.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX3
+// SWIFT_MIX3: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740
+
+//--- main.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _main
+_main:
+  mov w0, #0
+  ret
+
+//--- objc_old.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _objc1
+_objc1:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 0
+
+//--- objc_new.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _objc2
+_objc2:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 64
+
+//--- swift_4.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _swift4
+_swift4:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 1600
+
+//--- swift_5.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _swift5
+_swift5:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 83887936
+
+//--- swift_59.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _swift59
+_swift59:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 84477760
+
diff --git a/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S b/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S
new file mode 100644
index 000000000000000..90b5c3a38eebe05
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S
@@ -0,0 +1,111 @@
+// Test merging of __objc_imageinfo flags and ensure we can run mixed objc and
+// swift code in a single jit dylib.
+
+// REQUIRES: system-darwin && asserts
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: (cd %t; %clang -c *.S)
+
+// Check individual versions are loadable.
+
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_old.o 2>&1 | FileCheck %s -check-prefix=OLD
+// OLD: MachOPlatform: Registered __objc_imageinfo for main
+// OLD-SAME: flags = 0x0000
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=NEW
+// NEW: MachOPlatform: Registered __objc_imageinfo for main
+// NEW-SAME: flags = 0x0040
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_4.o 2>&1 | FileCheck %s -check-prefix=SWIFT_4
+// SWIFT_4: MachOPlatform: Registered __objc_imageinfo for main
+// SWIFT_4-SAME: flags = 0x0640
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_5
+// SWIFT_5: MachOPlatform: Registered __objc_imageinfo for main
+// SWIFT_5-SAME: flags = 0x5000740
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o 2>&1 | FileCheck %s -check-prefix=SWIFT_59
+// SWIFT_59: MachOPlatform: Registered __objc_imageinfo for main
+// SWIFT_59-SAME: flags = 0x5090740
+
+// Check error conditions.
+
+// RUN: not %llvm_jitlink %t/main.o %t/objc_old.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=CATEGORY
+// CATEGORY: ObjC category class property support in {{.*}} does not match first registered flags
+
+// RUN: not %llvm_jitlink %t/main.o %t/swift_4.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_ABI
+// SWIFT_ABI: Swift ABI version in {{.*}} does not match first registered flags
+
+// Check merging.
+
+// Take the lowest swift version.
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/swift_5.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX1
+// SWIFT_MIX1: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740
+
+// Add swift to objc.
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX2
+// SWIFT_MIX2: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5090740
+
+// Add multiple swift to objc.
+// RUN: %llvm_jitlink -debug-only=orc %t/main.o %t/swift_59.o %t/swift_5.o %t/objc_new.o 2>&1 | FileCheck %s -check-prefix=SWIFT_MIX3
+// SWIFT_MIX3: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740
+
+//--- main.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _main
+_main:
+  xorl %eax, %eax
+  ret
+
+//--- objc_old.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _objc1
+_objc1:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 0
+
+//--- objc_new.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _objc2
+_objc2:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 64
+
+//--- swift_4.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _swift4
+_swift4:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 1600
+
+//--- swift_5.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _swift5
+_swift5:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 83887936
+
+//--- swift_59.S
+.section  __TEXT,__text,regular,pure_instructions
+.globl _swift59
+_swift59:
+  ret
+
+  .section  __DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+  .long 0
+  .long 84477760
+
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
index b9e6279432e5b24..d4af6c9234e3a42 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
@@ -159,6 +159,9 @@ class MachOPlatform : public Platform {
     struct ObjCImageInfo {
       uint32_t Version = 0;
       uint32_t Flags = 0;
+      /// Whether this image info can no longer be mutated, as it may have been
+      /// registered with the objc runtime.
+      bool Finalized = false;
     };
 
     Error bootstrapPipelineStart(jitlink::LinkGraph &G);
@@ -173,6 +176,9 @@ class MachOPlatform : public Platform {
 
     Error processObjCImageInfo(jitlink::LinkGraph &G,
                                MaterializationResponsibility &MR);
+    Error mergeImageInfoFlags(jitlink::LinkGraph &G,
+                              MaterializationResponsibility &MR,
+                              ObjCImageInfo &Info, uint32_t NewFlags);
 
     Error fixTLVSectionsAndEdges(jitlink::LinkGraph &G, JITDylib &JD);
 
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index abc0bbbcaad2a39..6d8e2396137337b 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -266,6 +266,33 @@ static StringRef ObjCRuntimeObjectSectionName =
 static StringRef ObjCImageInfoSymbolName =
     "__llvm_jitlink_macho_objc_imageinfo";
 
+struct ObjCImageInfoFlags {
+  uint16_t SwiftABIVersion;
+  uint16_t SwiftVersion;
+  bool HasCategoryClassProperties;
+  bool HasSignedObjCClassROs;
+
+  static constexpr uint32_t SIGNED_CLASS_RO = (1 << 4);
+  static constexpr uint32_t HAS_CATEGORY_CLASS_PROPERTIES = (1 << 6);
+
+  explicit ObjCImageInfoFlags(uint32_t RawFlags) {
+    HasSignedObjCClassROs = RawFlags & SIGNED_CLASS_RO;
+    HasCategoryClassProperties = RawFlags & HAS_CATEGORY_CLASS_PROPERTIES;
+    SwiftABIVersion = (RawFlags >> 8) & 0xFF;
+    SwiftVersion = (RawFlags >> 16) & 0xFFFF;
+  }
+
+  uint32_t rawFlags() const {
+    uint32_t Result = 0;
+    if (HasCategoryClassProperties)
+      Result |= HAS_CATEGORY_CLASS_PROPERTIES;
+    if (HasSignedObjCClassROs)
+      Result |= SIGNED_CLASS_RO;
+    Result |= (SwiftABIVersion << 8);
+    Result |= (SwiftVersion << 16);
+    return Result;
+  }
+};
 } // end anonymous namespace
 
 namespace llvm {
@@ -1029,15 +1056,19 @@ Error MachOPlatform::MachOPlatformPlugin::processObjCImageInfo(
               " does not match first registered version",
           inconvertibleErrorCode());
     if (ObjCImageInfoItr->second.Flags != Flags)
-      return make_error<StringError>("ObjC flags in " + G.getName() +
-                                         " do not match first registered flags",
-                                     inconvertibleErrorCode());
+      if (Error E = mergeImageInfoFlags(G, MR, ObjCImageInfoItr->second, Flags))
+        return E;
 
     // __objc_imageinfo is valid. Delete the block.
     for (auto *S : ObjCImageInfo->symbols())
       G.removeDefinedSymbol(*S);
     G.removeBlock(ObjCImageInfoBlock);
   } else {
+    LLVM_DEBUG({
+      dbgs() << "MachOPlatform: Registered __objc_imageinfo for "
+             << MR.getTargetJITDylib().getName() << " in " << G.getName()
+             << "; flags = " << formatv("{0:x4}", Flags) << "\n";
+    });
     // We haven't registered an __objc_imageinfo section yet. Register and
     // move on. The section should already be marked no-dead-strip.
     G.addDefinedSymbol(ObjCImageInfoBlock, 0, ObjCImageInfoSymbolName,
@@ -1047,12 +1078,66 @@ Error MachOPlatform::MachOPlatformPlugin::processObjCImageInfo(
             {{MR.getExecutionSession().intern(ObjCImageInfoSymbolName),
               JITSymbolFlags()}}))
       return Err;
-    ObjCImageInfos[&MR.getTargetJITDylib()] = {Version, Flags};
+    ObjCImageInfos[&MR.getTargetJITDylib()] = {Version, Flags, false};
   }
 
   return Error::success();
 }
 
+Error MachOPlatform::MachOPlatformPlugin::mergeImageInfoFlags(
+    jitlink::LinkGraph &G, MaterializationResponsibility &MR,
+    ObjCImageInfo &Info, uint32_t NewFlags) {
+  if (Info.Flags == NewFlags)
+    return Error::success();
+
+  ObjCImageInfoFlags Old(Info.Flags);
+  ObjCImageInfoFlags New(NewFlags);
+
+  // Check for incompatible flags.
+  if (Old.SwiftABIVersion && New.SwiftABIVersion &&
+      Old.SwiftABIVersion != New.SwiftABIVersion)
+    return make_error<StringError>("Swift ABI version in " + G.getName() +
+                                       " does not match first registered flags",
+                                   inconvertibleErrorCode());
+
+  if (Old.HasCategoryClassProperties != New.HasCategoryClassProperties)
+    return make_error<StringError>("ObjC category class property support in " +
+                                       G.getName() +
+                                       " does not match first registered flags",
+                                   inconvertibleErrorCode());
+  if (Old.HasSignedObjCClassROs != New.HasSignedObjCClassROs)
+    return make_error<StringError>("ObjC class_ro_t pointer signing in " +
+                                       G.getName() +
+                                       " does not match first registered flags",
+                                   inconvertibleErrorCode());
+
+  // If we cannot change the flags, ignore any remaining differences. Adding
+  // Swift or changing its version is unlikely to cause problems in practice.
+  if (Info.Finalized)
+    return Error::success();
+
+  // Use the minimum Swift version.
+  if (Old.SwiftVersion && New.SwiftVersion)
+    New.SwiftVersion = std::min(Old.SwiftVersion, New.SwiftVersion);
+  else if (Old.SwiftVersion)
+    New.SwiftVersion = Old.SwiftVersion;
+  // Add a Swift ABI version if it was pure objc before.
+  if (!New.SwiftABIVersion)
+    New.SwiftABIVersion = Old.SwiftABIVersion;
+
+  LLVM_DEBUG({
+    dbgs() << "MachOPlatform: Merging __objc_imageinfo flags for "
+           << MR.getTargetJITDylib().getName() << " (was "
+           << formatv("{0:x4}", Old.rawFlags()) << ")"
+           << " with " << G.getName() << " (" << formatv("{0:x4}", NewFlags)
+           << ")"
+           << " -> " << formatv("{0:x4}", New.rawFlags()) << "\n";
+  });
+
+  Info.Flags = New.rawFlags();
+  return Error::success();
+}
+
 Error MachOPlatform::MachOPlatformPlugin::fixTLVSectionsAndEdges(
     jitlink::LinkGraph &G, JITDylib &JD) {
 
@@ -1403,6 +1488,24 @@ Error MachOPlatform::MachOPlatformPlugin::populateObjCRuntimeObject(
         for (auto *Sym : G.defined_symbols())
           if (Sym->hasName() && Sym->getName() == ObjCImageInfoSymbolName) {
             ObjCImageInfoSym = Sym;
+            std::optional<uint32_t> Flags;
+            {
+              std::lock_guard<std::mutex> Lock(PluginMutex);
+              auto It = ObjCImageInfos.find(&MR.getTargetJITDylib());
+              if (It != ObjCImageInfos.end()) {
+                It->second.Finalized = true;
+                Flags = It->second.Flags;
+              }
+            }
+
+            if (Flags) {
+              // We own the definition of __objc_image_info; write the final
+              // merged flags value.
+              auto Content = Sym->getBlock().getMutableContent(G);
+              assert(Content.size() == 8 &&
+                  "__objc_image_info size should have been verified already");
+              support::endian::write32(&Content[4], *Flags, G.getEndianness());
+            }
             break;
           }
       if (!ObjCImageInfoSym)

>From de88371d9d62eac598f8603b9a2aee6cbce4fe21 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 3 Nov 2023 15:34:45 +0000
Subject: [PATCH 37/76] [LLVM][AArch64] Add ASM constraints for reduced GPR
 register ranges. (#70970)

This patch adds the following ASM constraints:
  Uci => w8-w11/x8-x11
  Ucj => w12-w15/x12-x15

These constraints are required for SME load/store instructions, where
a reduced set of GPRs is used to specify ZA array vectors.

NOTE: GCC has agreed to use the same constraint syntax.
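
As a usage sketch (assuming a toolchain that implements these
constraints; the function name and instructions below are illustrative
and not taken from the patch):

// slice32 must be allocated to one of w8-w11, slice64 to x12-x15.
void za_index_example(int slice32, long slice64) {
  // "Uci": operand 0 is restricted to w8-w11; %w0 prints the W form.
  asm volatile("add w0, w0, %w0" : : "Uci"(slice32) : "w0");
  // "Ucj": operand 0 is restricted to x12-x15.
  asm volatile("add x0, x0, %0" : : "Ucj"(slice64) : "x0");
}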
---
 clang/docs/ReleaseNotes.rst                   |  2 +
 clang/lib/Basic/Targets/AArch64.cpp           |  6 ++
 clang/test/CodeGen/aarch64-inline-asm.c       | 15 ++++
 llvm/docs/LangRef.rst                         |  2 +
 .../Target/AArch64/AArch64ISelLowering.cpp    | 34 +++++++-
 .../AArch64/inlineasm-Uc-constraint.ll        | 78 +++++++++++++++++++
 6 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4696836b3a00caa..afe7e2e79c2d087 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -738,6 +738,8 @@ Arm and AArch64 Support
   This affects C++ functions with SVE ACLE parameters. Clang will use the old
   manglings if ``-fclang-abi-compat=17`` or lower is  specified.
 
+- New AArch64 asm constraints have been added for r8-r11 (Uci) and r12-r15 (Ucj).
+
 Android Support
 ^^^^^^^^^^^^^^^
 
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index fe5a7af97b7753c..c71af71eba60ce2 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -1306,6 +1306,12 @@ bool AArch64TargetInfo::validateAsmConstraint(
       Name += 2;
       return true;
     }
+    if (Name[1] == 'c' && (Name[2] == 'i' || Name[2] == 'j')) {
+      // GPR registers ("Uci"=w8-11, "Ucj"=w12-15)
+      Info.setAllowsRegister();
+      Name += 2;
+      return true;
+    }
     // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes.
     // Utf: A memory address suitable for ldp/stp in TF mode.
     // Usa: An absolute symbolic address.
diff --git a/clang/test/CodeGen/aarch64-inline-asm.c b/clang/test/CodeGen/aarch64-inline-asm.c
index 439fb9e33f9ae15..75e9a8c46b87692 100644
--- a/clang/test/CodeGen/aarch64-inline-asm.c
+++ b/clang/test/CodeGen/aarch64-inline-asm.c
@@ -80,3 +80,18 @@ void test_tied_earlyclobber(void) {
   asm("" : "+&r"(a));
   // CHECK: call i32 asm "", "=&{x1},0"(i32 %0)
 }
+
+void test_reduced_gpr_constraints(int var32, long var64) {
+  asm("add w0, w0, %0" : : "Uci"(var32) : "w0");
+// CHECK: [[ARG1:%.+]] = load i32, ptr
+// CHECK: call void asm sideeffect "add w0, w0, $0", "@3Uci,~{w0}"(i32 [[ARG1]])
+  asm("add x0, x0, %0" : : "Uci"(var64) : "x0");
+// CHECK: [[ARG1:%.+]] = load i64, ptr
+// CHECK: call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i64 [[ARG1]])
+  asm("add w0, w0, %0" : : "Ucj"(var32) : "w0");
+// CHECK: [[ARG2:%.+]] = load i32, ptr
+// CHECK: call void asm sideeffect "add w0, w0, $0", "@3Ucj,~{w0}"(i32 [[ARG2]])
+  asm("add x0, x0, %0" : : "Ucj"(var64) : "x0");
+// CHECK: [[ARG2:%.+]] = load i64, ptr
+// CHECK: call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 [[ARG2]])
+}
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 6fd483276a301c7..1e9d42ed0a06079 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -5094,6 +5094,8 @@ AArch64:
   offsets). (However, LLVM currently does this for the ``m`` constraint as
   well.)
 - ``r``: A 32 or 64-bit integer register (W* or X*).
+- ``Uci``: Like r, but restricted to registers 8 to 11 inclusive.
+- ``Ucj``: Like r, but restricted to registers 12 to 15 inclusive.
 - ``w``: A 32, 64, or 128-bit floating-point, SIMD or SVE vector register.
 - ``x``: Like w, but restricted to registers 0 to 15 inclusive.
 - ``y``: Like w, but restricted to SVE vector registers Z0 to Z7 inclusive.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 94901c2d1a65688..f5193a9f2adf30c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10195,6 +10195,31 @@ getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
   llvm_unreachable("Missing PredicateConstraint!");
 }
 
+enum class ReducedGprConstraint { Uci, Ucj };
+
+static std::optional<ReducedGprConstraint>
+parseReducedGprConstraint(StringRef Constraint) {
+  return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
+      .Case("Uci", ReducedGprConstraint::Uci)
+      .Case("Ucj", ReducedGprConstraint::Ucj)
+      .Default(std::nullopt);
+}
+
+static const TargetRegisterClass *
+getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
+  if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
+    return nullptr;
+
+  switch (Constraint) {
+  case ReducedGprConstraint::Uci:
+    return &AArch64::MatrixIndexGPR32_8_11RegClass;
+  case ReducedGprConstraint::Ucj:
+    return &AArch64::MatrixIndexGPR32_12_15RegClass;
+  }
+
+  llvm_unreachable("Missing ReducedGprConstraint!");
+}
+
 // The set of cc code supported is from
 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
@@ -10292,6 +10317,8 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
     }
   } else if (parsePredicateConstraint(Constraint))
     return C_RegisterClass;
+  else if (parseReducedGprConstraint(Constraint))
+    return C_RegisterClass;
   else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
     return C_Other;
   return TargetLowering::getConstraintType(Constraint);
@@ -10325,7 +10352,8 @@ AArch64TargetLowering::getSingleConstraintMatchWeight(
     weight = CW_Constant;
     break;
   case 'U':
-    if (parsePredicateConstraint(constraint))
+    if (parsePredicateConstraint(constraint) ||
+        parseReducedGprConstraint(constraint))
       weight = CW_Register;
     break;
   }
@@ -10385,6 +10413,10 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
     if (const auto PC = parsePredicateConstraint(Constraint))
       if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
         return std::make_pair(0U, RegClass);
+
+    if (const auto RGC = parseReducedGprConstraint(Constraint))
+      if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
+        return std::make_pair(0U, RegClass);
   }
   if (StringRef("{cc}").equals_insensitive(Constraint) ||
       parseConstraintCode(Constraint) != AArch64CC::Invalid)
diff --git a/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll b/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll
new file mode 100644
index 000000000000000..0bee7ea40cc1aeb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -o - | FileCheck %s
+
+target triple = "arm64-none-linux-gnu"
+
+define void @test_constraints_Uci_w(i32 %a) {
+; CHECK-LABEL: test_constraints_Uci_w:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i32 %a)
+  ret void
+}
+
+; As test_constraints_Uci_w but ensures non-legal types are also covered.
+define void @test_constraints_Uci_w_i8(i8 %a) {
+; CHECK-LABEL: test_constraints_Uci_w_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i8 %a)
+  ret void
+}
+
+define void @test_constraints_Uci_x(i64 %a) {
+; CHECK-LABEL: test_constraints_Uci_x:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i64 %a)
+  ret void
+}
+
+define void @test_constraint_Ucj_w(i32 %a) {
+; CHECK-LABEL: test_constraint_Ucj_w:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x12
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i32 %a)
+  ret void
+}
+
+; As test_constraints_Ucj_w but ensures non-legal types are also covered.
+define void @test_constraint_Ucj_w_i8(i8 %a) {
+; CHECK-LABEL: test_constraint_Ucj_w_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x12
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i8 %a)
+  ret void
+}
+
+define void @test_constraint_Ucj_x(i64 %a) {
+; CHECK-LABEL: test_constraint_Ucj_x:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x12, x0
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    add x0, x0, x12
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ret
+  call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 %a)
+  ret void
+}

>From 51018d1a90542a407c78868e6be29a2492c18f5a Mon Sep 17 00:00:00 2001
From: Piotr Zegar <me at piotrzegar.pl>
Date: Fri, 3 Nov 2023 16:39:09 +0100
Subject: [PATCH 38/76] [clang-tidy] Improve modernize-make-shared check
 (#70600)

Improved the modernize-make-shared check to support std::shared_ptr
implementations that inherit the reset method from a base class; in
GCC's libstdc++ that base class is called __shared_ptr.

Fixes #64481
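
For reference, a minimal sketch of the code pattern the check now
recognises (Widget and update are made up for this example; the
shared_ptr layout assumed is the libstdc++-style one mocked in the
updated shared_ptr.h test header below):

#include <memory>

struct Widget { int X = 0; };

void update(std::shared_ptr<Widget> &P) {
  P.reset(new Widget);            // now diagnosed: prefer std::make_shared
  P = std::make_shared<Widget>(); // the form the check suggests
}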
---
 .../modernize/MakeSmartPtrCheck.cpp           | 20 ++++++++------
 clang-tools-extra/docs/ReleaseNotes.rst       |  5 ++++
 .../modernize/Inputs/smart-ptr/shared_ptr.h   | 27 ++++++++++++-------
 3 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp
index 2f9f47d3f6c3e85..71fd8eca300c1b2 100644
--- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp
@@ -96,14 +96,18 @@ void MakeSmartPtrCheck::registerMatchers(ast_matchers::MatchFinder *Finder) {
       this);
 
   Finder->addMatcher(
-      traverse(TK_AsIs,
-               cxxMemberCallExpr(
-                   thisPointerType(getSmartPointerTypeMatcher()),
-                   callee(cxxMethodDecl(hasName("reset"))),
-                   hasArgument(0, cxxNewExpr(CanCallCtor, unless(IsPlacement))
-                                      .bind(NewExpression)),
-                   unless(isInTemplateInstantiation()))
-                   .bind(ResetCall)),
+      traverse(
+          TK_AsIs,
+          cxxMemberCallExpr(
+              unless(isInTemplateInstantiation()),
+              hasArgument(0, cxxNewExpr(CanCallCtor, unless(IsPlacement))
+                                 .bind(NewExpression)),
+              callee(cxxMethodDecl(hasName("reset"))),
+              anyOf(thisPointerType(getSmartPointerTypeMatcher()),
+                    on(ignoringImplicit(anyOf(
+                        hasType(getSmartPointerTypeMatcher()),
+                        hasType(pointsTo(getSmartPointerTypeMatcher())))))))
+              .bind(ResetCall)),
       this);
 }
 
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index ecfb3aa9267f140..f9671a65a26fca3 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -331,6 +331,11 @@ Changes in existing checks
   iterators initialized by free functions like ``begin``, ``end``, or ``size``
   and avoid crash for array of dependent array.
 
+- Improved :doc:`modernize-make-shared
+  <clang-tidy/checks/modernize/make-shared>` check to support
+  ``std::shared_ptr`` implementations that inherit the ``reset`` method from a
+  base class.
+
 - Improved :doc:`modernize-return-braced-init-list
   <clang-tidy/checks/modernize/return-braced-init-list>` check to ignore
   false-positives when constructing the container with ``count`` copies of
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/shared_ptr.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/shared_ptr.h
index 0f4f2a97095b56f..337cb28228b09c4 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/shared_ptr.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/shared_ptr.h
@@ -1,24 +1,33 @@
 namespace std {
 
 template <typename type>
-class shared_ptr {
+class __shared_ptr {
+protected:
+  __shared_ptr();
+  __shared_ptr(type *ptr);
+  ~__shared_ptr();
 public:
-  shared_ptr();
-  shared_ptr(type *ptr);
-  shared_ptr(const shared_ptr<type> &t) {}
-  shared_ptr(shared_ptr<type> &&t) {}
-  ~shared_ptr();
   type &operator*() { return *ptr; }
   type *operator->() { return ptr; }
   type *release();
   void reset();
   void reset(type *pt);
-  shared_ptr &operator=(shared_ptr &&);
-  template <typename T>
-  shared_ptr &operator=(shared_ptr<T> &&);
 
 private:
   type *ptr;
 };
 
+template <typename type>
+class shared_ptr : public __shared_ptr<type> {
+public:
+  shared_ptr();
+  shared_ptr(type *ptr);
+  shared_ptr(const shared_ptr<type> &t);
+  shared_ptr(shared_ptr<type> &&t);
+  ~shared_ptr();
+  shared_ptr &operator=(shared_ptr &&);
+  template <typename T>
+  shared_ptr &operator=(shared_ptr<T> &&);
+};
+
 }  // namespace std

>From f6f769203dd3c4da410c9f8d4cc5108a9cb6640a Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 3 Nov 2023 08:35:46 -0700
Subject: [PATCH 39/76] [tests] Autogenerate a couple of tests

As usual, this makes it easier to see an upcoming test delta.

Note that several of these are examples of extremely bad testing
practice: checking internal debug output (for no real purpose), and
checking the result of a full O2 + llc run instead of reducing to the
specific problematic pass.
---
 llvm/test/CodeGen/BPF/loop-exit-cond.ll       |  30 +-
 .../HardwareLoops/ARM/fp-emulation.ll         | 295 +++++--
 .../Transforms/IRCE/variable-loop-bounds.ll   | 790 ++++++++++++++++--
 3 files changed, 999 insertions(+), 116 deletions(-)

diff --git a/llvm/test/CodeGen/BPF/loop-exit-cond.ll b/llvm/test/CodeGen/BPF/loop-exit-cond.ll
index dc77ea393c6773e..a134ed1f7ebcd9b 100644
--- a/llvm/test/CodeGen/BPF/loop-exit-cond.ll
+++ b/llvm/test/CodeGen/BPF/loop-exit-cond.ll
@@ -1,5 +1,5 @@
-; RUN: opt -O2 -S -o %t1 < %s
-; RUN: llc -march=bpf -mcpu=v3 %t1 -o - | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -O2 -S < %s | FileCheck %s
 ;
 ; Source code:
 ;   typedef unsigned long u64;
@@ -26,6 +26,29 @@ target triple = "bpf"
 
 ; Function Attrs: nounwind
 define dso_local i32 @test(i32 %len, ptr %data) #0 {
+; CHECK-LABEL: define dso_local i32 @test(
+; CHECK-SAME: i32 [[LEN:%.*]], ptr nocapture readonly [[DATA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[D:%.*]] = alloca [1 x i64], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], -2
+; CHECK-NEXT:    [[OR_COND:%.*]] = icmp ult i32 [[TMP0]], 98
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FOR_BODY:%.*]], label [[IF_END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[D]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[DATA]], align 1, !tbaa [[TBAA3:![0-9]+]]
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[NARROW:%.*]] = select i1 [[TOBOOL_NOT]], i8 48, i8 [[TMP1]]
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i8 [[NARROW]] to i64
+; CHECK-NEXT:    store i64 [[CONV2]], ptr [[D]], align 8, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    call void @foo(ptr nonnull @.str, i32 [[I_05]], ptr nonnull [[D]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[D]]) #[[ATTR3]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_05]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[INC]], [[LEN]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY]], label [[IF_END]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %len.addr = alloca i32, align 4
   %data.addr = alloca ptr, align 8
@@ -48,9 +71,6 @@ for.cond:                                         ; preds = %for.inc, %if.then
   %cmp1 = icmp slt i32 %1, %2
   br i1 %cmp1, label %for.body, label %for.cond.cleanup
 
-; CHECK:      w[[LEN:[0-9]+]] = w1
-; CHECK:      w[[IDX:[0-9]+]] += 1
-; CHECK-NEXT: w[[IDX]] s< w[[LEN]] goto
 
 for.cond.cleanup:                                 ; preds = %for.cond
   call void @llvm.lifetime.end.p0(i64 4, ptr %i) #3
diff --git a/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll b/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
index 86f9e1e85b9da56..51fa9158f761918 100644
--- a/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
+++ b/llvm/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
@@ -1,22 +1,74 @@
-; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+fp-armv8 -passes=hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
-; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+soft-float -passes=hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT
-
-; CHECK-LABEL: test_fptosi
-; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations
-
-; CHECK: entry:
-; CHECK-FP: [[COUNT:%[^ ]+]] = call i32 @llvm.umax.i32(i32 %n, i32 1)
-
-; CHECK: while.body.lr.ph:
-; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-FP-NEXT: br label %while.body
-
-; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
-; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
-; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+fp-armv8 -passes=hardware-loops %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-FP
+; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+soft-float -passes=hardware-loops %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
 
 define void @test_fptosi(i32 %n, ptr %g, ptr %d) {
+; CHECK-FP-LABEL: define void @test_fptosi(
+; CHECK-FP-SAME: i32 [[N:%.*]], ptr [[G:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-FP-NEXT:  entry:
+; CHECK-FP-NEXT:    [[N_OFF:%.*]] = add i32 [[N]], -1
+; CHECK-FP-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[N_OFF]], 500
+; CHECK-FP-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-FP-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY_LR_PH:%.*]], label [[CLEANUP:%.*]]
+; CHECK-FP:       while.body.lr.ph:
+; CHECK-FP-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D]], align 4
+; CHECK-FP-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[G]], align 4
+; CHECK-FP-NEXT:    [[TMP3:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[UMAX]])
+; CHECK-FP-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-FP:       while.body:
+; CHECK-FP-NEXT:    [[I_012:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END4:%.*]] ]
+; CHECK-FP-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY_LR_PH]] ], [ [[TMP6:%.*]], [[IF_END4]] ]
+; CHECK-FP-NEXT:    [[REM:%.*]] = urem i32 [[I_012]], 10
+; CHECK-FP-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-FP-NEXT:    br i1 [[TOBOOL]], label [[IF_END4]], label [[IF_THEN2:%.*]]
+; CHECK-FP:       if.then2:
+; CHECK-FP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 [[I_012]]
+; CHECK-FP-NEXT:    [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-FP-NEXT:    [[CONV:%.*]] = fptosi double [[TMP5]] to i32
+; CHECK-FP-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 [[I_012]]
+; CHECK-FP-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX3]], align 4
+; CHECK-FP-NEXT:    br label [[IF_END4]]
+; CHECK-FP:       if.end4:
+; CHECK-FP-NEXT:    [[INC]] = add nuw i32 [[I_012]], 1
+; CHECK-FP-NEXT:    [[TMP6]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1)
+; CHECK-FP-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-FP-NEXT:    br i1 [[TMP7]], label [[WHILE_BODY]], label [[CLEANUP_LOOPEXIT:%.*]]
+; CHECK-FP:       cleanup.loopexit:
+; CHECK-FP-NEXT:    br label [[CLEANUP]]
+; CHECK-FP:       cleanup:
+; CHECK-FP-NEXT:    ret void
+;
+; CHECK-SOFT-LABEL: define void @test_fptosi(
+; CHECK-SOFT-SAME: i32 [[N:%.*]], ptr [[G:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SOFT-NEXT:  entry:
+; CHECK-SOFT-NEXT:    [[N_OFF:%.*]] = add i32 [[N]], -1
+; CHECK-SOFT-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[N_OFF]], 500
+; CHECK-SOFT-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY_LR_PH:%.*]], label [[CLEANUP:%.*]]
+; CHECK-SOFT:       while.body.lr.ph:
+; CHECK-SOFT-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D]], align 4
+; CHECK-SOFT-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[G]], align 4
+; CHECK-SOFT-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-SOFT:       while.body:
+; CHECK-SOFT-NEXT:    [[I_012:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END4:%.*]] ]
+; CHECK-SOFT-NEXT:    [[REM:%.*]] = urem i32 [[I_012]], 10
+; CHECK-SOFT-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-SOFT-NEXT:    br i1 [[TOBOOL]], label [[IF_END4]], label [[IF_THEN2:%.*]]
+; CHECK-SOFT:       if.then2:
+; CHECK-SOFT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 [[I_012]]
+; CHECK-SOFT-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-SOFT-NEXT:    [[CONV:%.*]] = fptosi double [[TMP3]] to i32
+; CHECK-SOFT-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 [[I_012]]
+; CHECK-SOFT-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX3]], align 4
+; CHECK-SOFT-NEXT:    br label [[IF_END4]]
+; CHECK-SOFT:       if.end4:
+; CHECK-SOFT-NEXT:    [[INC]] = add nuw i32 [[I_012]], 1
+; CHECK-SOFT-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[INC]], [[N]]
+; CHECK-SOFT-NEXT:    br i1 [[CMP1]], label [[WHILE_BODY]], label [[CLEANUP_LOOPEXIT:%.*]]
+; CHECK-SOFT:       cleanup.loopexit:
+; CHECK-SOFT-NEXT:    br label [[CLEANUP]]
+; CHECK-SOFT:       cleanup:
+; CHECK-SOFT-NEXT:    ret void
+;
 entry:
   %n.off = add i32 %n, -1
   %0 = icmp ult i32 %n.off, 500
@@ -53,21 +105,73 @@ cleanup:
   ret void
 }
 
-; CHECK-LABEL: test_fptoui
-; CHECK: entry:
-; CHECK-FP: [[COUNT:%[^ ]+]] = call i32 @llvm.umax.i32(i32 %n, i32 1)
-; CHECK-FP: while.body.lr.ph:
-; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-FP-NEXT: br label %while.body
-
-; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
-; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
-; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
-
-; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations
-
 define void @test_fptoui(i32 %n, ptr %g, ptr %d) {
+; CHECK-FP-LABEL: define void @test_fptoui(
+; CHECK-FP-SAME: i32 [[N:%.*]], ptr [[G:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-FP-NEXT:  entry:
+; CHECK-FP-NEXT:    [[N_OFF:%.*]] = add i32 [[N]], -1
+; CHECK-FP-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[N_OFF]], 500
+; CHECK-FP-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-FP-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY_LR_PH:%.*]], label [[CLEANUP:%.*]]
+; CHECK-FP:       while.body.lr.ph:
+; CHECK-FP-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D]], align 4
+; CHECK-FP-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[G]], align 4
+; CHECK-FP-NEXT:    [[TMP3:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[UMAX]])
+; CHECK-FP-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-FP:       while.body:
+; CHECK-FP-NEXT:    [[I_012:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END4:%.*]] ]
+; CHECK-FP-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY_LR_PH]] ], [ [[TMP6:%.*]], [[IF_END4]] ]
+; CHECK-FP-NEXT:    [[REM:%.*]] = urem i32 [[I_012]], 10
+; CHECK-FP-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-FP-NEXT:    br i1 [[TOBOOL]], label [[IF_END4]], label [[IF_THEN2:%.*]]
+; CHECK-FP:       if.then2:
+; CHECK-FP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 [[I_012]]
+; CHECK-FP-NEXT:    [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-FP-NEXT:    [[CONV:%.*]] = fptoui double [[TMP5]] to i32
+; CHECK-FP-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 [[I_012]]
+; CHECK-FP-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX3]], align 4
+; CHECK-FP-NEXT:    br label [[IF_END4]]
+; CHECK-FP:       if.end4:
+; CHECK-FP-NEXT:    [[INC]] = add nuw i32 [[I_012]], 1
+; CHECK-FP-NEXT:    [[TMP6]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1)
+; CHECK-FP-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-FP-NEXT:    br i1 [[TMP7]], label [[WHILE_BODY]], label [[CLEANUP_LOOPEXIT:%.*]]
+; CHECK-FP:       cleanup.loopexit:
+; CHECK-FP-NEXT:    br label [[CLEANUP]]
+; CHECK-FP:       cleanup:
+; CHECK-FP-NEXT:    ret void
+;
+; CHECK-SOFT-LABEL: define void @test_fptoui(
+; CHECK-SOFT-SAME: i32 [[N:%.*]], ptr [[G:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-SOFT-NEXT:  entry:
+; CHECK-SOFT-NEXT:    [[N_OFF:%.*]] = add i32 [[N]], -1
+; CHECK-SOFT-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[N_OFF]], 500
+; CHECK-SOFT-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY_LR_PH:%.*]], label [[CLEANUP:%.*]]
+; CHECK-SOFT:       while.body.lr.ph:
+; CHECK-SOFT-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D]], align 4
+; CHECK-SOFT-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[G]], align 4
+; CHECK-SOFT-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-SOFT:       while.body:
+; CHECK-SOFT-NEXT:    [[I_012:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END4:%.*]] ]
+; CHECK-SOFT-NEXT:    [[REM:%.*]] = urem i32 [[I_012]], 10
+; CHECK-SOFT-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-SOFT-NEXT:    br i1 [[TOBOOL]], label [[IF_END4]], label [[IF_THEN2:%.*]]
+; CHECK-SOFT:       if.then2:
+; CHECK-SOFT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 [[I_012]]
+; CHECK-SOFT-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-SOFT-NEXT:    [[CONV:%.*]] = fptoui double [[TMP3]] to i32
+; CHECK-SOFT-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 [[I_012]]
+; CHECK-SOFT-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX3]], align 4
+; CHECK-SOFT-NEXT:    br label [[IF_END4]]
+; CHECK-SOFT:       if.end4:
+; CHECK-SOFT-NEXT:    [[INC]] = add nuw i32 [[I_012]], 1
+; CHECK-SOFT-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[INC]], [[N]]
+; CHECK-SOFT-NEXT:    br i1 [[CMP1]], label [[WHILE_BODY]], label [[CLEANUP_LOOPEXIT:%.*]]
+; CHECK-SOFT:       cleanup.loopexit:
+; CHECK-SOFT-NEXT:    br label [[CLEANUP]]
+; CHECK-SOFT:       cleanup:
+; CHECK-SOFT-NEXT:    ret void
+;
 entry:
   %n.off = add i32 %n, -1
   %0 = icmp ult i32 %n.off, 500
@@ -104,19 +208,41 @@ cleanup:
   ret void
 }
 
-; CHECK-LABEL: load_store_float
-; CHECK: entry:
-; CHECK:   [[COUNT:%[^ ]+]] = call i32 @llvm.umax.i32(i32 %n, i32 1)
-; CHECK: while.body.lr.ph:
-; CHECK:   [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-NEXT: br label %while.body
-
-; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
-; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
-; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
-
 define void @load_store_float(i32 %n, ptr %d, ptr %g) {
+; CHECK-LABEL: define void @load_store_float(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[D:%.*]], ptr [[G:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[N_OFF:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[N_OFF]], 500
+; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY_LR_PH:%.*]], label [[CLEANUP:%.*]]
+; CHECK:       while.body.lr.ph:
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[G]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[UMAX]])
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[I_012:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END4:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY_LR_PH]] ], [ [[TMP6:%.*]], [[IF_END4]] ]
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[I_012]], 10
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END4]], label [[IF_THEN2:%.*]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 [[I_012]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 [[I_012]]
+; CHECK-NEXT:    store double [[TMP5]], ptr [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    br label [[IF_END4]]
+; CHECK:       if.end4:
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_012]], 1
+; CHECK-NEXT:    [[TMP6]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1)
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[WHILE_BODY]], label [[CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       cleanup.loopexit:
+; CHECK-NEXT:    br label [[CLEANUP]]
+; CHECK:       cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   %n.off = add i32 %n, -1
   %0 = icmp ult i32 %n.off, 500
@@ -152,22 +278,75 @@ cleanup:
   ret void
 }
 
-; CHECK-LABEL: fp_add
-; CHECK-SOFT-NOT: call i32 @llvm.start.loop.iterations
-; CHECK: entry:
-; CHECK-FP: [[COUNT:%[^ ]+]] = call i32 @llvm.umax.i32(i32 %n, i32 1)
-; CHECK: while.body.lr.ph:
-; CHECK-FP: [[START:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 [[COUNT]])
-; CHECK: br label %while.body
-
-; CHECK-SOFT-NOT: call i32 @llvm.loop.decrement
-
-; CHECK-FP: [[REM:%[^ ]+]] = phi i32 [ [[START]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
-; CHECK-FP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
-; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
-
 define void @fp_add(i32 %n, ptr %d, ptr %g) {
+; CHECK-FP-LABEL: define void @fp_add(
+; CHECK-FP-SAME: i32 [[N:%.*]], ptr [[D:%.*]], ptr [[G:%.*]]) #[[ATTR0]] {
+; CHECK-FP-NEXT:  entry:
+; CHECK-FP-NEXT:    [[N_OFF:%.*]] = add i32 [[N]], -1
+; CHECK-FP-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[N_OFF]], 500
+; CHECK-FP-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-FP-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY_LR_PH:%.*]], label [[CLEANUP:%.*]]
+; CHECK-FP:       while.body.lr.ph:
+; CHECK-FP-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D]], align 4
+; CHECK-FP-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[G]], align 4
+; CHECK-FP-NEXT:    [[TMP3:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[UMAX]])
+; CHECK-FP-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-FP:       while.body:
+; CHECK-FP-NEXT:    [[I_012:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END4:%.*]] ]
+; CHECK-FP-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY_LR_PH]] ], [ [[TMP7:%.*]], [[IF_END4]] ]
+; CHECK-FP-NEXT:    [[REM:%.*]] = urem i32 [[I_012]], 10
+; CHECK-FP-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-FP-NEXT:    br i1 [[TOBOOL]], label [[IF_END4]], label [[IF_THEN2:%.*]]
+; CHECK-FP:       if.then2:
+; CHECK-FP-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[I_012]]
+; CHECK-FP-NEXT:    [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-FP-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 [[I_012]]
+; CHECK-FP-NEXT:    [[TMP6:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; CHECK-FP-NEXT:    [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]]
+; CHECK-FP-NEXT:    store float [[ADD]], ptr [[ARRAYIDX3]], align 4
+; CHECK-FP-NEXT:    br label [[IF_END4]]
+; CHECK-FP:       if.end4:
+; CHECK-FP-NEXT:    [[INC]] = add nuw i32 [[I_012]], 1
+; CHECK-FP-NEXT:    [[TMP7]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1)
+; CHECK-FP-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP7]], 0
+; CHECK-FP-NEXT:    br i1 [[TMP8]], label [[WHILE_BODY]], label [[CLEANUP_LOOPEXIT:%.*]]
+; CHECK-FP:       cleanup.loopexit:
+; CHECK-FP-NEXT:    br label [[CLEANUP]]
+; CHECK-FP:       cleanup:
+; CHECK-FP-NEXT:    ret void
+;
+; CHECK-SOFT-LABEL: define void @fp_add(
+; CHECK-SOFT-SAME: i32 [[N:%.*]], ptr [[D:%.*]], ptr [[G:%.*]]) #[[ATTR0]] {
+; CHECK-SOFT-NEXT:  entry:
+; CHECK-SOFT-NEXT:    [[N_OFF:%.*]] = add i32 [[N]], -1
+; CHECK-SOFT-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[N_OFF]], 500
+; CHECK-SOFT-NEXT:    br i1 [[TMP0]], label [[WHILE_BODY_LR_PH:%.*]], label [[CLEANUP:%.*]]
+; CHECK-SOFT:       while.body.lr.ph:
+; CHECK-SOFT-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[D]], align 4
+; CHECK-SOFT-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[G]], align 4
+; CHECK-SOFT-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK-SOFT:       while.body:
+; CHECK-SOFT-NEXT:    [[I_012:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END4:%.*]] ]
+; CHECK-SOFT-NEXT:    [[REM:%.*]] = urem i32 [[I_012]], 10
+; CHECK-SOFT-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-SOFT-NEXT:    br i1 [[TOBOOL]], label [[IF_END4]], label [[IF_THEN2:%.*]]
+; CHECK-SOFT:       if.then2:
+; CHECK-SOFT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 [[I_012]]
+; CHECK-SOFT-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-SOFT-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 [[I_012]]
+; CHECK-SOFT-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; CHECK-SOFT-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
+; CHECK-SOFT-NEXT:    store float [[ADD]], ptr [[ARRAYIDX3]], align 4
+; CHECK-SOFT-NEXT:    br label [[IF_END4]]
+; CHECK-SOFT:       if.end4:
+; CHECK-SOFT-NEXT:    [[INC]] = add nuw i32 [[I_012]], 1
+; CHECK-SOFT-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[INC]], [[N]]
+; CHECK-SOFT-NEXT:    br i1 [[CMP1]], label [[WHILE_BODY]], label [[CLEANUP_LOOPEXIT:%.*]]
+; CHECK-SOFT:       cleanup.loopexit:
+; CHECK-SOFT-NEXT:    br label [[CLEANUP]]
+; CHECK-SOFT:       cleanup:
+; CHECK-SOFT-NEXT:    ret void
+;
 entry:
   %n.off = add i32 %n, -1
   %0 = icmp ult i32 %n.off, 500
diff --git a/llvm/test/Transforms/IRCE/variable-loop-bounds.ll b/llvm/test/Transforms/IRCE/variable-loop-bounds.ll
index 81946f1c857e53b..43d450b938afea4 100644
--- a/llvm/test/Transforms/IRCE/variable-loop-bounds.ll
+++ b/llvm/test/Transforms/IRCE/variable-loop-bounds.ll
@@ -1,23 +1,85 @@
-; RUN: opt -passes=irce -S -verify-loop-info -irce-print-changed-loops -irce-skip-profitability-checks < %s 2>&1 | FileCheck %s
-
-; CHECK: irce: in function test_inc_eq: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%if.then,%for.inc<latch><exiting>
-; CHECK: irce: in function test_inc_ne: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%if.then,%for.inc<latch><exiting>
-; CHECK: irce: in function test_inc_slt: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%if.then,%for.inc<latch><exiting>
-; CHECK: irce: in function test_inc_ult: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%if.then,%for.inc<latch><exiting>
-; CHECK: irce: in function signed_var_imm_dec_sgt: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%for.inc<latch><exiting>
-; CHECK-NOT: irce: in function signed_var_imm_dec_slt: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%for.inc<latch><exiting>
-; CHECK: irce: in function signed_var_imm_dec_sge: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%for.inc<latch><exiting>
-; CHECK: irce: in function signed_var_imm_dec_ne: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%for.inc<latch><exiting>
-; CHECK-NOT: irce: in function signed_var_imm_dec_eq: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%for.inc<latch><exiting>
-; CHECK-NOT: irce: in function test_dec_bound_with_smaller_start_than_bound: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%for.dec<latch><exiting>
-; CHECK-NOT: irce: in function test_inc_bound_with_bigger_start_than_bound: constrained Loop at depth 1 containing: %for.body<header>,%if.else,%for.dec<latch><exiting>
-
-; CHECK-LABEL: test_inc_eq(
-; CHECK: main.exit.selector:
-; CHECK: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %inc, %for.inc ]
-; CHECK: [[COND:%[^ ]+]] = icmp ult i32 [[PSEUDO_PHI]], %N
-; CHECK: br i1 [[COND]], label %main.pseudo.exit, label %for.cond.cleanup.loopexit
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=irce -S -verify-loop-info -irce-skip-profitability-checks < %s 2>&1 | FileCheck %s
+
 define void @test_inc_eq(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %N) {
+; CHECK-LABEL: define void @test_inc_eq(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP16]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.umin.i32(i32 [[N]], i32 512)
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[FOR_BODY_PREHEADER1:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preheader1:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_017:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER1]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[I_017]], 512
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br i1 true, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUB]], [[TMP3]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    store i32 [[ADD6]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_017]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[INC]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i1 [[TMP4]], true
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MAIN_EXIT_SELECTOR:%.*]], label [[FOR_BODY]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[INC_LCSSA]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MAIN_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[I_017_COPY:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[FOR_BODY_POSTLOOP:%.*]]
+; CHECK:       for.body.postloop:
+; CHECK-NEXT:    [[I_017_POSTLOOP:%.*]] = phi i32 [ [[INC_POSTLOOP:%.*]], [[FOR_INC_POSTLOOP:%.*]] ], [ [[I_017_COPY]], [[POSTLOOP]] ]
+; CHECK-NEXT:    [[CMP1_POSTLOOP:%.*]] = icmp ult i32 [[I_017_POSTLOOP]], 512
+; CHECK-NEXT:    [[ARRAYIDX_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX2_POSTLOOP]], align 4
+; CHECK-NEXT:    br i1 [[CMP1_POSTLOOP]], label [[IF_THEN_POSTLOOP:%.*]], label [[IF_ELSE_POSTLOOP:%.*]]
+; CHECK:       if.else.postloop:
+; CHECK-NEXT:    [[ADD6_POSTLOOP:%.*]] = add nsw i32 [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[ARRAYIDX7_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    store i32 [[ADD6_POSTLOOP]], ptr [[ARRAYIDX7_POSTLOOP]], align 4
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       if.then.postloop:
+; CHECK-NEXT:    [[SUB_POSTLOOP:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX3_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ADD_POSTLOOP:%.*]] = add nsw i32 [[SUB_POSTLOOP]], [[TMP9]]
+; CHECK-NEXT:    store i32 [[ADD_POSTLOOP]], ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       for.inc.postloop:
+; CHECK-NEXT:    [[INC_POSTLOOP]] = add nuw nsw i32 [[I_017_POSTLOOP]], 1
+; CHECK-NEXT:    [[EXITCOND_POSTLOOP:%.*]] = icmp eq i32 [[INC_POSTLOOP]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_POSTLOOP]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY_POSTLOOP]], !llvm.loop [[LOOP0:![0-9]+]], !loop_constrainer.loop.clone !5
+;
 entry:
   %cmp16 = icmp sgt i32 %N, 0
   br i1 %cmp16, label %for.body, label %for.cond.cleanup
@@ -54,12 +116,84 @@ for.inc:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
-; CHECK-LABEL: test_inc_ne
-; CHECK: main.exit.selector:
-; CHECK: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %inc, %for.inc ]
-; CHECK: [[COND:%[^ ]+]] = icmp ult i32 [[PSEUDO_PHI]], %N
-; CHECK: br i1 [[COND]], label %main.pseudo.exit, label %for.cond.cleanup.loopexit
 define void @test_inc_ne(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %N) {
+; CHECK-LABEL: define void @test_inc_ne(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP16]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.umin.i32(i32 [[N]], i32 512)
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[FOR_BODY_PREHEADER1:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preheader1:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_017:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER1]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[I_017]], 512
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br i1 true, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUB]], [[TMP3]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    store i32 [[ADD6]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_017]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[N]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[INC]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[INC_LCSSA]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MAIN_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[I_017_COPY:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[FOR_BODY_POSTLOOP:%.*]]
+; CHECK:       for.body.postloop:
+; CHECK-NEXT:    [[I_017_POSTLOOP:%.*]] = phi i32 [ [[INC_POSTLOOP:%.*]], [[FOR_INC_POSTLOOP:%.*]] ], [ [[I_017_COPY]], [[POSTLOOP]] ]
+; CHECK-NEXT:    [[CMP1_POSTLOOP:%.*]] = icmp ult i32 [[I_017_POSTLOOP]], 512
+; CHECK-NEXT:    [[ARRAYIDX_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX2_POSTLOOP]], align 4
+; CHECK-NEXT:    br i1 [[CMP1_POSTLOOP]], label [[IF_THEN_POSTLOOP:%.*]], label [[IF_ELSE_POSTLOOP:%.*]]
+; CHECK:       if.else.postloop:
+; CHECK-NEXT:    [[ADD6_POSTLOOP:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[ARRAYIDX7_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    store i32 [[ADD6_POSTLOOP]], ptr [[ARRAYIDX7_POSTLOOP]], align 4
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       if.then.postloop:
+; CHECK-NEXT:    [[SUB_POSTLOOP:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[ARRAYIDX3_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ADD_POSTLOOP:%.*]] = add nsw i32 [[SUB_POSTLOOP]], [[TMP8]]
+; CHECK-NEXT:    store i32 [[ADD_POSTLOOP]], ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       for.inc.postloop:
+; CHECK-NEXT:    [[INC_POSTLOOP]] = add nuw nsw i32 [[I_017_POSTLOOP]], 1
+; CHECK-NEXT:    [[EXITCOND_POSTLOOP:%.*]] = icmp ne i32 [[INC_POSTLOOP]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_POSTLOOP]], label [[FOR_BODY_POSTLOOP]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]], !loop_constrainer.loop.clone !5
+;
 entry:
   %cmp16 = icmp sgt i32 %N, 0
   br i1 %cmp16, label %for.body, label %for.cond.cleanup
@@ -96,12 +230,85 @@ for.inc:
   br i1 %exitcond, label %for.body, label %for.cond.cleanup
 }
 
-; CHECK-LABEL: test_inc_slt(
-; CHECK: main.exit.selector:
-; CHECK: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %inc, %for.inc ]
-; CHECK: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], %N
-; CHECK: br i1 [[COND]], label %main.pseudo.exit, label %for.cond.cleanup.loopexit
 define void @test_inc_slt(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %N) {
+; CHECK-LABEL: define void @test_inc_slt(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP16]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[N]], i32 512)
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN]], i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp slt i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[FOR_BODY_PREHEADER1:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preheader1:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_017:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER1]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[I_017]], 512
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br i1 true, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUB]], [[TMP3]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    store i32 [[ADD6]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_017]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i32 [[INC]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt i32 [[INC_LCSSA]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MAIN_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[I_017_COPY:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[FOR_BODY_POSTLOOP:%.*]]
+; CHECK:       for.body.postloop:
+; CHECK-NEXT:    [[I_017_POSTLOOP:%.*]] = phi i32 [ [[INC_POSTLOOP:%.*]], [[FOR_INC_POSTLOOP:%.*]] ], [ [[I_017_COPY]], [[POSTLOOP]] ]
+; CHECK-NEXT:    [[CMP1_POSTLOOP:%.*]] = icmp ult i32 [[I_017_POSTLOOP]], 512
+; CHECK-NEXT:    [[ARRAYIDX_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX2_POSTLOOP]], align 4
+; CHECK-NEXT:    br i1 [[CMP1_POSTLOOP]], label [[IF_THEN_POSTLOOP:%.*]], label [[IF_ELSE_POSTLOOP:%.*]]
+; CHECK:       if.else.postloop:
+; CHECK-NEXT:    [[ADD6_POSTLOOP:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[ARRAYIDX7_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    store i32 [[ADD6_POSTLOOP]], ptr [[ARRAYIDX7_POSTLOOP]], align 4
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       if.then.postloop:
+; CHECK-NEXT:    [[SUB_POSTLOOP:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[ARRAYIDX3_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ADD_POSTLOOP:%.*]] = add nsw i32 [[SUB_POSTLOOP]], [[TMP8]]
+; CHECK-NEXT:    store i32 [[ADD_POSTLOOP]], ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       for.inc.postloop:
+; CHECK-NEXT:    [[INC_POSTLOOP]] = add nuw nsw i32 [[I_017_POSTLOOP]], 1
+; CHECK-NEXT:    [[EXITCOND_POSTLOOP:%.*]] = icmp slt i32 [[INC_POSTLOOP]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_POSTLOOP]], label [[FOR_BODY_POSTLOOP]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]], !loop_constrainer.loop.clone !5
+;
 entry:
   %cmp16 = icmp sgt i32 %N, 0
   br i1 %cmp16, label %for.body, label %for.cond.cleanup
@@ -138,12 +345,84 @@ for.inc:
   br i1 %exitcond, label %for.body, label %for.cond.cleanup
 }
 
-; CHECK-LABEL: test_inc_ult
-; CHECK: main.exit.selector:
-; CHECK: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %inc, %for.inc ]
-; CHECK: [[COND:%[^ ]+]] = icmp ult i32 [[PSEUDO_PHI]], %N
-; CHECK: br i1 [[COND]], label %main.pseudo.exit, label %for.cond.cleanup.loopexit
 define void @test_inc_ult(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %N) {
+; CHECK-LABEL: define void @test_inc_ult(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP16:%.*]] = icmp ugt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP16]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.umin.i32(i32 [[N]], i32 512)
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[FOR_BODY_PREHEADER1:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preheader1:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_017:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER1]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[I_017]], 512
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    br i1 true, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUB]], [[TMP3]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017]]
+; CHECK-NEXT:    store i32 [[ADD6]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_017]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ult i32 [[INC]], [[N]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[INC]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[INC_LCSSA]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MAIN_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[I_017_COPY:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[FOR_BODY_POSTLOOP:%.*]]
+; CHECK:       for.body.postloop:
+; CHECK-NEXT:    [[I_017_POSTLOOP:%.*]] = phi i32 [ [[INC_POSTLOOP:%.*]], [[FOR_INC_POSTLOOP:%.*]] ], [ [[I_017_COPY]], [[POSTLOOP]] ]
+; CHECK-NEXT:    [[CMP1_POSTLOOP:%.*]] = icmp ult i32 [[I_017_POSTLOOP]], 512
+; CHECK-NEXT:    [[ARRAYIDX_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX2_POSTLOOP]], align 4
+; CHECK-NEXT:    br i1 [[CMP1_POSTLOOP]], label [[IF_THEN_POSTLOOP:%.*]], label [[IF_ELSE_POSTLOOP:%.*]]
+; CHECK:       if.else.postloop:
+; CHECK-NEXT:    [[ADD6_POSTLOOP:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[ARRAYIDX7_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    store i32 [[ADD6_POSTLOOP]], ptr [[ARRAYIDX7_POSTLOOP]], align 4
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       if.then.postloop:
+; CHECK-NEXT:    [[SUB_POSTLOOP:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[ARRAYIDX3_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_017_POSTLOOP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ADD_POSTLOOP:%.*]] = add nsw i32 [[SUB_POSTLOOP]], [[TMP8]]
+; CHECK-NEXT:    store i32 [[ADD_POSTLOOP]], ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       for.inc.postloop:
+; CHECK-NEXT:    [[INC_POSTLOOP]] = add nuw nsw i32 [[I_017_POSTLOOP]], 1
+; CHECK-NEXT:    [[EXITCOND_POSTLOOP:%.*]] = icmp ult i32 [[INC_POSTLOOP]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_POSTLOOP]], label [[FOR_BODY_POSTLOOP]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]], !loop_constrainer.loop.clone !5
+;
 entry:
   %cmp16 = icmp ugt i32 %N, 0
   br i1 %cmp16, label %for.body, label %for.cond.cleanup
@@ -180,12 +459,114 @@ for.inc:
   br i1 %exitcond, label %for.body, label %for.cond.cleanup
 }
 
-; CHECK-LABEL: signed_var_imm_dec_sgt(
-; CHECK: main.exit.selector:
-; CHECK: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %dec, %for.inc ]
-; CHECK: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], %M
-; CHECK: br i1 [[COND]]
 define void @signed_var_imm_dec_sgt(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %M) {
+; CHECK-LABEL: define void @signed_var_imm_dec_sgt(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp slt i32 [[M]], 1024
+; CHECK-NEXT:    br i1 [[CMP14]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[M]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 1024)
+; CHECK-NEXT:    [[EXIT_PRELOOP_AT:%.*]] = add nsw i32 [[SMAX]], -1
+; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = add nsw i32 [[SMAX1]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 1024, [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_BODY_PRELOOP_PREHEADER:%.*]], label [[PRELOOP_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preloop.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_PRELOOP:%.*]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       mainloop:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[INDVAR_END:%.*]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_BODY_PREHEADER3:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preheader3:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_INC:%.*]] ], [ [[IV_PRELOOP_COPY:%.*]], [[FOR_BODY_PREHEADER3]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[IV]], 1024
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    br i1 true, label [[FOR_INC]], label [[IF_ELSE:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[MUL]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[ADD]], [[IF_ELSE]] ], [ [[MUL]], [[FOR_BODY]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[IV]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[M]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt i32 [[DEC]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_BODY]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[DEC_LCSSA:%.*]] = phi i32 [ [[DEC]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[DEC_LCSSA]], [[M]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MAIN_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[IV_COPY:%.*]] = phi i32 [ [[IV_PRELOOP_COPY]], [[MAINLOOP:%.*]] ], [ [[DEC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END2:%.*]] = phi i32 [ [[INDVAR_END]], [[MAINLOOP]] ], [ [[DEC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       for.body.preloop:
+; CHECK-NEXT:    [[IV_PRELOOP:%.*]] = phi i32 [ [[DEC_PRELOOP:%.*]], [[FOR_INC_PRELOOP:%.*]] ], [ 1024, [[FOR_BODY_PRELOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[CMP1_PRELOOP:%.*]] = icmp slt i32 [[IV_PRELOOP]], 1024
+; CHECK-NEXT:    [[ARRAYIDX_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX_PRELOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX2_PRELOOP]], align 4
+; CHECK-NEXT:    [[MUL_PRELOOP:%.*]] = mul nsw i32 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX3_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    br i1 [[CMP1_PRELOOP]], label [[FOR_INC_PRELOOP]], label [[IF_ELSE_PRELOOP:%.*]]
+; CHECK:       if.else.preloop:
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX3_PRELOOP]], align 4
+; CHECK-NEXT:    [[ADD_PRELOOP:%.*]] = add nsw i32 [[TMP10]], [[MUL_PRELOOP]]
+; CHECK-NEXT:    br label [[FOR_INC_PRELOOP]]
+; CHECK:       for.inc.preloop:
+; CHECK-NEXT:    [[STOREMERGE_PRELOOP:%.*]] = phi i32 [ [[ADD_PRELOOP]], [[IF_ELSE_PRELOOP]] ], [ [[MUL_PRELOOP]], [[FOR_BODY_PRELOOP]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE_PRELOOP]], ptr [[ARRAYIDX3_PRELOOP]], align 4
+; CHECK-NEXT:    [[DEC_PRELOOP]] = add nsw i32 [[IV_PRELOOP]], -1
+; CHECK-NEXT:    [[CMP_PRELOOP:%.*]] = icmp sgt i32 [[DEC_PRELOOP]], [[M]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[DEC_PRELOOP]], [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[FOR_BODY_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR:%.*]], !llvm.loop [[LOOP9:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK:       preloop.exit.selector:
+; CHECK-NEXT:    [[DEC_PRELOOP_LCSSA:%.*]] = phi i32 [ [[DEC_PRELOOP]], [[FOR_INC_PRELOOP]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[DEC_PRELOOP_LCSSA]], [[M]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRELOOP_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       preloop.pseudo.exit:
+; CHECK-NEXT:    [[IV_PRELOOP_COPY]] = phi i32 [ 1024, [[FOR_BODY_PREHEADER]] ], [ [[DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END]] = phi i32 [ 1024, [[FOR_BODY_PREHEADER]] ], [ [[DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[MAINLOOP]]
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[FOR_BODY_POSTLOOP:%.*]]
+; CHECK:       for.body.postloop:
+; CHECK-NEXT:    [[IV_POSTLOOP:%.*]] = phi i32 [ [[DEC_POSTLOOP:%.*]], [[FOR_INC_POSTLOOP:%.*]] ], [ [[IV_COPY]], [[POSTLOOP]] ]
+; CHECK-NEXT:    [[CMP1_POSTLOOP:%.*]] = icmp slt i32 [[IV_POSTLOOP]], 1024
+; CHECK-NEXT:    [[ARRAYIDX_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX2_POSTLOOP]], align 4
+; CHECK-NEXT:    [[MUL_POSTLOOP:%.*]] = mul nsw i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX3_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    br i1 [[CMP1_POSTLOOP]], label [[FOR_INC_POSTLOOP]], label [[IF_ELSE_POSTLOOP:%.*]]
+; CHECK:       if.else.postloop:
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ADD_POSTLOOP:%.*]] = add nsw i32 [[TMP15]], [[MUL_POSTLOOP]]
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       for.inc.postloop:
+; CHECK-NEXT:    [[STOREMERGE_POSTLOOP:%.*]] = phi i32 [ [[ADD_POSTLOOP]], [[IF_ELSE_POSTLOOP]] ], [ [[MUL_POSTLOOP]], [[FOR_BODY_POSTLOOP]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE_POSTLOOP]], ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[DEC_POSTLOOP]] = add nsw i32 [[IV_POSTLOOP]], -1
+; CHECK-NEXT:    [[CMP_POSTLOOP:%.*]] = icmp sgt i32 [[DEC_POSTLOOP]], [[M]]
+; CHECK-NEXT:    br i1 [[CMP_POSTLOOP]], label [[FOR_BODY_POSTLOOP]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]], !loop_constrainer.loop.clone !5
+;
 entry:
   %cmp14 = icmp slt i32 %M, 1024
   br i1 %cmp14, label %for.body, label %for.cond.cleanup
@@ -217,12 +598,116 @@ for.inc:                                          ; preds = %for.body, %if.else
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
 
-; CHECK-LABEL: signed_var_imm_dec_sge(
-; CHECK: main.exit.selector:          ; preds = %for.inc
-; CHECK: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %iv, %for.inc ]
-; CHECK: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], %M
-; CHECK: br i1 [[COND]]
 define void @signed_var_imm_dec_sge(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %M) {
+; CHECK-LABEL: define void @signed_var_imm_dec_sge(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp sgt i32 [[M]], 1024
+; CHECK-NEXT:    br i1 [[CMP14]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[M]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 1025)
+; CHECK-NEXT:    [[EXIT_PRELOOP_AT:%.*]] = add nsw i32 [[SMAX]], -1
+; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 1)
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = add nsw i32 [[SMAX1]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 1025, [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_BODY_PRELOOP_PREHEADER:%.*]], label [[PRELOOP_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preloop.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_PRELOOP:%.*]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       mainloop:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[INDVAR_END:%.*]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_BODY_PREHEADER3:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preheader3:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_INC:%.*]] ], [ [[IV_PRELOOP_COPY:%.*]], [[FOR_BODY_PREHEADER3]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[IV]], 1024
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    br i1 true, label [[FOR_INC]], label [[IF_ELSE:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[MUL]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[ADD]], [[IF_ELSE]] ], [ [[MUL]], [[FOR_BODY]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[IV]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[IV]], [[M]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt i32 [[IV]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_BODY]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[DEC_LCSSA:%.*]] = phi i32 [ [[DEC]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[IV_LCSSA]], [[M]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MAIN_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[IV_COPY:%.*]] = phi i32 [ [[IV_PRELOOP_COPY]], [[MAINLOOP:%.*]] ], [ [[DEC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END2:%.*]] = phi i32 [ [[INDVAR_END]], [[MAINLOOP]] ], [ [[IV_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       for.body.preloop:
+; CHECK-NEXT:    [[IV_PRELOOP:%.*]] = phi i32 [ [[DEC_PRELOOP:%.*]], [[FOR_INC_PRELOOP:%.*]] ], [ 1024, [[FOR_BODY_PRELOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[CMP1_PRELOOP:%.*]] = icmp slt i32 [[IV_PRELOOP]], 1024
+; CHECK-NEXT:    [[ARRAYIDX_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX_PRELOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX2_PRELOOP]], align 4
+; CHECK-NEXT:    [[MUL_PRELOOP:%.*]] = mul nsw i32 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX3_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    br i1 [[CMP1_PRELOOP]], label [[FOR_INC_PRELOOP]], label [[IF_ELSE_PRELOOP:%.*]]
+; CHECK:       if.else.preloop:
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX3_PRELOOP]], align 4
+; CHECK-NEXT:    [[ADD_PRELOOP:%.*]] = add nsw i32 [[TMP10]], [[MUL_PRELOOP]]
+; CHECK-NEXT:    br label [[FOR_INC_PRELOOP]]
+; CHECK:       for.inc.preloop:
+; CHECK-NEXT:    [[STOREMERGE_PRELOOP:%.*]] = phi i32 [ [[ADD_PRELOOP]], [[IF_ELSE_PRELOOP]] ], [ [[MUL_PRELOOP]], [[FOR_BODY_PRELOOP]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE_PRELOOP]], ptr [[ARRAYIDX3_PRELOOP]], align 4
+; CHECK-NEXT:    [[DEC_PRELOOP]] = add nsw i32 [[IV_PRELOOP]], -1
+; CHECK-NEXT:    [[CMP_PRELOOP:%.*]] = icmp sgt i32 [[IV_PRELOOP]], [[M]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[IV_PRELOOP]], [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[FOR_BODY_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR:%.*]], !llvm.loop [[LOOP11:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK:       preloop.exit.selector:
+; CHECK-NEXT:    [[DEC_PRELOOP_LCSSA:%.*]] = phi i32 [ [[DEC_PRELOOP]], [[FOR_INC_PRELOOP]] ]
+; CHECK-NEXT:    [[IV_PRELOOP_LCSSA:%.*]] = phi i32 [ [[IV_PRELOOP]], [[FOR_INC_PRELOOP]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[IV_PRELOOP_LCSSA]], [[M]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRELOOP_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       preloop.pseudo.exit:
+; CHECK-NEXT:    [[IV_PRELOOP_COPY]] = phi i32 [ 1024, [[FOR_BODY_PREHEADER]] ], [ [[DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END]] = phi i32 [ 1025, [[FOR_BODY_PREHEADER]] ], [ [[IV_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[MAINLOOP]]
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[FOR_BODY_POSTLOOP:%.*]]
+; CHECK:       for.body.postloop:
+; CHECK-NEXT:    [[IV_POSTLOOP:%.*]] = phi i32 [ [[DEC_POSTLOOP:%.*]], [[FOR_INC_POSTLOOP:%.*]] ], [ [[IV_COPY]], [[POSTLOOP]] ]
+; CHECK-NEXT:    [[CMP1_POSTLOOP:%.*]] = icmp slt i32 [[IV_POSTLOOP]], 1024
+; CHECK-NEXT:    [[ARRAYIDX_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX2_POSTLOOP]], align 4
+; CHECK-NEXT:    [[MUL_POSTLOOP:%.*]] = mul nsw i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX3_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    br i1 [[CMP1_POSTLOOP]], label [[FOR_INC_POSTLOOP]], label [[IF_ELSE_POSTLOOP:%.*]]
+; CHECK:       if.else.postloop:
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ADD_POSTLOOP:%.*]] = add nsw i32 [[TMP15]], [[MUL_POSTLOOP]]
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       for.inc.postloop:
+; CHECK-NEXT:    [[STOREMERGE_POSTLOOP:%.*]] = phi i32 [ [[ADD_POSTLOOP]], [[IF_ELSE_POSTLOOP]] ], [ [[MUL_POSTLOOP]], [[FOR_BODY_POSTLOOP]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE_POSTLOOP]], ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[DEC_POSTLOOP]] = add nsw i32 [[IV_POSTLOOP]], -1
+; CHECK-NEXT:    [[CMP_POSTLOOP:%.*]] = icmp sgt i32 [[IV_POSTLOOP]], [[M]]
+; CHECK-NEXT:    br i1 [[CMP_POSTLOOP]], label [[FOR_BODY_POSTLOOP]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]], !loop_constrainer.loop.clone !5
+;
 entry:
   %cmp14 = icmp sgt i32 %M, 1024
   br i1 %cmp14, label %for.cond.cleanup, label %for.body
@@ -255,6 +740,38 @@ for.inc:                                          ; preds = %for.body, %if.else
 }
 
 define void @signed_var_imm_dec_slt(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %M) {
+; CHECK-LABEL: define void @signed_var_imm_dec_slt(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp sgt i32 [[M]], 1024
+; CHECK-NEXT:    br i1 [[CMP14]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_INC:%.*]] ], [ 1024, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[IV]], 1024
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_INC]], label [[IF_ELSE:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[MUL]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[ADD]], [[IF_ELSE]] ], [ [[MUL]], [[FOR_BODY]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[IV]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], [[M]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
 entry:
   %cmp14 = icmp sgt i32 %M, 1024
   br i1 %cmp14, label %for.cond.cleanup, label %for.body
@@ -286,12 +803,114 @@ for.inc:                                          ; preds = %for.body, %if.else
   br i1 %cmp, label %for.cond.cleanup, label %for.body
 }
 
-; CHECK-LABEL: signed_var_imm_dec_ne(
-; CHECK: main.exit.selector:          ; preds = %for.inc
-; CHECK: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %dec, %for.inc ]
-; CHECK: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], %M
-; CHECK: br i1 [[COND]]
 define void @signed_var_imm_dec_ne(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %M) {
+; CHECK-LABEL: define void @signed_var_imm_dec_ne(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp slt i32 [[M]], 1024
+; CHECK-NEXT:    br i1 [[CMP14]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[M]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 1024)
+; CHECK-NEXT:    [[EXIT_PRELOOP_AT:%.*]] = add nsw i32 [[SMAX]], -1
+; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT:    [[EXIT_MAINLOOP_AT:%.*]] = add nsw i32 [[SMAX1]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 1024, [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_BODY_PRELOOP_PREHEADER:%.*]], label [[PRELOOP_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preloop.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_PRELOOP:%.*]]
+; CHECK:       for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       mainloop:
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[INDVAR_END:%.*]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_BODY_PREHEADER3:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK:       for.body.preheader3:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_INC:%.*]] ], [ [[IV_PRELOOP_COPY:%.*]], [[FOR_BODY_PREHEADER3]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[IV]], 1024
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    br i1 true, label [[FOR_INC]], label [[IF_ELSE:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[MUL]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[ADD]], [[IF_ELSE]] ], [ [[MUL]], [[FOR_BODY]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[IV]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[DEC]], [[M]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt i32 [[DEC]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_BODY]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK:       main.exit.selector:
+; CHECK-NEXT:    [[DEC_LCSSA:%.*]] = phi i32 [ [[DEC]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[DEC_LCSSA]], [[M]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MAIN_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       main.pseudo.exit:
+; CHECK-NEXT:    [[IV_COPY:%.*]] = phi i32 [ [[IV_PRELOOP_COPY]], [[MAINLOOP:%.*]] ], [ [[DEC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END2:%.*]] = phi i32 [ [[INDVAR_END]], [[MAINLOOP]] ], [ [[DEC_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[POSTLOOP:%.*]]
+; CHECK:       for.body.preloop:
+; CHECK-NEXT:    [[IV_PRELOOP:%.*]] = phi i32 [ [[DEC_PRELOOP:%.*]], [[FOR_INC_PRELOOP:%.*]] ], [ 1024, [[FOR_BODY_PRELOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[CMP1_PRELOOP:%.*]] = icmp slt i32 [[IV_PRELOOP]], 1024
+; CHECK-NEXT:    [[ARRAYIDX_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX_PRELOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX2_PRELOOP]], align 4
+; CHECK-NEXT:    [[MUL_PRELOOP:%.*]] = mul nsw i32 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[ARRAYIDX3_PRELOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV_PRELOOP]]
+; CHECK-NEXT:    br i1 [[CMP1_PRELOOP]], label [[FOR_INC_PRELOOP]], label [[IF_ELSE_PRELOOP:%.*]]
+; CHECK:       if.else.preloop:
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX3_PRELOOP]], align 4
+; CHECK-NEXT:    [[ADD_PRELOOP:%.*]] = add nsw i32 [[TMP10]], [[MUL_PRELOOP]]
+; CHECK-NEXT:    br label [[FOR_INC_PRELOOP]]
+; CHECK:       for.inc.preloop:
+; CHECK-NEXT:    [[STOREMERGE_PRELOOP:%.*]] = phi i32 [ [[ADD_PRELOOP]], [[IF_ELSE_PRELOOP]] ], [ [[MUL_PRELOOP]], [[FOR_BODY_PRELOOP]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE_PRELOOP]], ptr [[ARRAYIDX3_PRELOOP]], align 4
+; CHECK-NEXT:    [[DEC_PRELOOP]] = add nsw i32 [[IV_PRELOOP]], -1
+; CHECK-NEXT:    [[CMP_PRELOOP:%.*]] = icmp ne i32 [[DEC_PRELOOP]], [[M]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[DEC_PRELOOP]], [[EXIT_PRELOOP_AT]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[FOR_BODY_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR:%.*]], !llvm.loop [[LOOP13:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK:       preloop.exit.selector:
+; CHECK-NEXT:    [[DEC_PRELOOP_LCSSA:%.*]] = phi i32 [ [[DEC_PRELOOP]], [[FOR_INC_PRELOOP]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[DEC_PRELOOP_LCSSA]], [[M]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[PRELOOP_PSEUDO_EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK:       preloop.pseudo.exit:
+; CHECK-NEXT:    [[IV_PRELOOP_COPY]] = phi i32 [ 1024, [[FOR_BODY_PREHEADER]] ], [ [[DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    [[INDVAR_END]] = phi i32 [ 1024, [[FOR_BODY_PREHEADER]] ], [ [[DEC_PRELOOP_LCSSA]], [[PRELOOP_EXIT_SELECTOR]] ]
+; CHECK-NEXT:    br label [[MAINLOOP]]
+; CHECK:       postloop:
+; CHECK-NEXT:    br label [[FOR_BODY_POSTLOOP:%.*]]
+; CHECK:       for.body.postloop:
+; CHECK-NEXT:    [[IV_POSTLOOP:%.*]] = phi i32 [ [[DEC_POSTLOOP:%.*]], [[FOR_INC_POSTLOOP:%.*]] ], [ [[IV_COPY]], [[POSTLOOP]] ]
+; CHECK-NEXT:    [[CMP1_POSTLOOP:%.*]] = icmp slt i32 [[IV_POSTLOOP]], 1024
+; CHECK-NEXT:    [[ARRAYIDX_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX2_POSTLOOP]], align 4
+; CHECK-NEXT:    [[MUL_POSTLOOP:%.*]] = mul nsw i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX3_POSTLOOP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV_POSTLOOP]]
+; CHECK-NEXT:    br i1 [[CMP1_POSTLOOP]], label [[FOR_INC_POSTLOOP]], label [[IF_ELSE_POSTLOOP:%.*]]
+; CHECK:       if.else.postloop:
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[ADD_POSTLOOP:%.*]] = add nsw i32 [[TMP15]], [[MUL_POSTLOOP]]
+; CHECK-NEXT:    br label [[FOR_INC_POSTLOOP]]
+; CHECK:       for.inc.postloop:
+; CHECK-NEXT:    [[STOREMERGE_POSTLOOP:%.*]] = phi i32 [ [[ADD_POSTLOOP]], [[IF_ELSE_POSTLOOP]] ], [ [[MUL_POSTLOOP]], [[FOR_BODY_POSTLOOP]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE_POSTLOOP]], ptr [[ARRAYIDX3_POSTLOOP]], align 4
+; CHECK-NEXT:    [[DEC_POSTLOOP]] = add nsw i32 [[IV_POSTLOOP]], -1
+; CHECK-NEXT:    [[CMP_POSTLOOP:%.*]] = icmp ne i32 [[DEC_POSTLOOP]], [[M]]
+; CHECK-NEXT:    br i1 [[CMP_POSTLOOP]], label [[FOR_BODY_POSTLOOP]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]], !loop_constrainer.loop.clone !5
+;
 entry:
   %cmp14 = icmp slt i32 %M, 1024
   br i1 %cmp14, label %for.body, label %for.cond.cleanup
@@ -323,7 +942,40 @@ for.inc:                                          ; preds = %for.body, %if.else
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 }
 
+;; Negative test: IRCE should not constrain this loop.
 define void @signed_var_imm_dec_eq(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %M) {
+; CHECK-LABEL: define void @signed_var_imm_dec_eq(
+; CHECK-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], ptr nocapture readonly [[C:%.*]], i32 [[M:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp slt i32 [[M]], 1024
+; CHECK-NEXT:    br i1 [[CMP14]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_INC:%.*]] ], [ 1024, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[IV]], 1024
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_INC]], label [[IF_ELSE:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[MUL]]
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[ADD]], [[IF_ELSE]] ], [ [[MUL]], [[FOR_BODY]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[IV]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], [[M]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+;
 entry:
   %cmp14 = icmp slt i32 %M, 1024
   br i1 %cmp14, label %for.body, label %for.cond.cleanup
@@ -355,9 +1007,25 @@ for.inc:                                          ; preds = %for.body, %if.else
   br i1 %cmp, label %for.cond.cleanup, label %for.body
 }
 
-; CHECK-LABEL: @test_dec_bound_with_smaller_start_than_bound(
-; CHECK-NOT:       preloop.exit.selector:
+;; Negative test: the decreasing IV starts below its bound, so IRCE should make no changes.
 define void @test_dec_bound_with_smaller_start_than_bound(i64 %0) {
+; CHECK-LABEL: define void @test_dec_bound_with_smaller_start_than_bound(
+; CHECK-SAME: i64 [[TMP0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[DEC:%.*]], [[FOR_DEC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i64 [[IV]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_ELSE:%.*]], label [[FOR_DEC]]
+; CHECK:       if.else:
+; CHECK-NEXT:    br label [[FOR_DEC]]
+; CHECK:       for.dec:
+; CHECK-NEXT:    [[DEC]] = sub nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i64 [[DEC]], 1
+; CHECK-NEXT:    br i1 [[TMP2]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -378,9 +1046,25 @@ exit:                                               ; preds = %for.dec
   ret void
 }
 
-; CHECK-LABEL: @test_inc_bound_with_bigger_start_than_bound(
-; CHECK-NOT:       main.exit.selector:
+;; Negative test: the increasing IV starts above its bound, so IRCE should make no changes.
 define void @test_inc_bound_with_bigger_start_than_bound(i32 %0) {
+; CHECK-LABEL: define void @test_inc_bound_with_bigger_start_than_bound(
+; CHECK-SAME: i32 [[TMP0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 200, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[IV]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_ELSE:%.*]], label [[FOR_INC]]
+; CHECK:       if.else:
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[INC]], 100
+; CHECK-NEXT:    br i1 [[TMP2]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 

>From 015c06ade023362ba7410e06618dea407fa07e5f Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 3 Nov 2023 08:42:59 -0700
Subject: [PATCH 40/76] Regenerate a couple of SCEV/IndVars tests [nfc]

Update to modern output to reduce spurious deltas in an upcoming change.
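
As a point of reference, this kind of refresh is normally done with the
update script shipped in the LLVM tree rather than by hand. A minimal
sketch, assuming `opt` was built into `build/bin` (the binary path is an
assumption, not part of this patch):

    python llvm/utils/update_test_checks.py \
        --opt-binary=build/bin/opt \
        llvm/test/Analysis/ScalarEvolution/trip-count-negative-stride.ll \
        llvm/test/Transforms/IndVarSimplify/cycled_phis.ll

The script reruns each test's RUN line and rewrites its CHECK lines in
place, which is what turns the looser CHECK matches below into the
stricter CHECK-NEXT form.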
---
 .../trip-count-negative-stride.ll             | 42 +++++++++----------
 .../Transforms/IndVarSimplify/cycled_phis.ll  |  4 +-
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-negative-stride.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-negative-stride.ll
index b238e55bf83eb71..710f47649b1db07 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-negative-stride.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-negative-stride.ll
@@ -63,7 +63,7 @@ define void @ult_infinite_ub() mustprogress {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 1
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 1
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 2
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 2
 ;
 entry:
   br label %for.body
@@ -88,7 +88,7 @@ define void @ult_129_not_taken() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 0
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:
   br label %for.body
@@ -111,7 +111,7 @@ define void @ult_129_unknown_start(i8 %start) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 0
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:
   br label %for.body
@@ -163,7 +163,7 @@ define void @ult_ub1() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 2
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 2
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 3
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 3
 ;
 entry:
   br label %for.body
@@ -188,7 +188,7 @@ define void @ult_ub2() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 0
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:
   br label %for.body
@@ -213,7 +213,7 @@ define void @ult_129_preinc() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 1
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 1
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 2
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 2
 ;
 entry:
   br label %for.body
@@ -236,7 +236,7 @@ define void @ult_preinc(i8 %step) {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 1
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 1
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 2
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 2
 ;
 entry:
   %assume = icmp ult i8 128, %step
@@ -313,7 +313,7 @@ define void @slt_wrap() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 63
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 63
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 64
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 64
 ;
 entry:
   br label %for.body
@@ -361,7 +361,7 @@ define void @slt_infinite_ub() mustprogress {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 0
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:
   br label %for.body
@@ -386,7 +386,7 @@ define void @slt_129_not_taken() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 0
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:
   br label %for.body
@@ -433,7 +433,7 @@ define void @slt_129_unknown_start(i8 %start) mustprogress {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is (((127 + (-1 * (1 umin (127 + (-1 * %start) + (0 smax (-127 + %start)<nsw>))))<nuw><nsw> + (-1 * %start) + (0 smax (-127 + %start)<nsw>)) /u -127) + (1 umin (127 + (-1 * %start) + (0 smax (-127 + %start)<nsw>))))
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is (((127 + (-1 * (1 umin (127 + (-1 * %start) + (0 smax (-127 + %start)<nsw>))))<nuw><nsw> + (-1 * %start) + (0 smax (-127 + %start)<nsw>)) /u -127) + (1 umin (127 + (-1 * %start) + (0 smax (-127 + %start)<nsw>))))
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:
   br label %for.body
@@ -459,7 +459,7 @@ define void @slt_ub1() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is false
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is false
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:
   br label %for.body
@@ -484,7 +484,7 @@ define void @slt_ub2() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is false
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is false
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 1
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 1
 ;
 entry:
   br label %for.body
@@ -509,7 +509,7 @@ define void @slt_129_preinc() {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 1
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 1
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 2
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 2
 ;
 entry:
   br label %for.body
@@ -532,7 +532,7 @@ define void @slt_preinc(i8 %step) {
 ; CHECK-NEXT:  Loop %for.body: symbolic max backedge-taken count is 1
 ; CHECK-NEXT:  Loop %for.body: Predicated backedge-taken count is 1
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %for.body: Trip multiple is 2
+; CHECK-NEXT:  Loop %for.body: Trip multiple is 2
 ;
 entry:
   %assume = icmp ult i8 128, %step
@@ -606,13 +606,13 @@ define void @step_is_neg_addrec_slt_8(i64 %n) {
 ; CHECK-NEXT:  Loop %inner: symbolic max backedge-taken count is (7 /u {0,+,-1}<nuw><nsw><%outer.header>)
 ; CHECK-NEXT:  Loop %inner: Predicated backedge-taken count is (7 /u {0,+,-1}<nuw><nsw><%outer.header>)
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %inner: Trip multiple is 1
+; CHECK-NEXT:  Loop %inner: Trip multiple is 1
 ; CHECK-NEXT:  Loop %outer.header: backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: constant max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: symbolic max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: Predicated backedge-taken count is 0
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %outer.header: Trip multiple is 1
+; CHECK-NEXT:  Loop %outer.header: Trip multiple is 1
 ;
 entry:
   br label %outer.header
@@ -648,13 +648,13 @@ define void @step_is_neg_addrec_slt_var(i32 %n) {
 ; CHECK-NEXT:  Loop %inner: symbolic max backedge-taken count is ({0,+,1}<nuw><nsw><%outer.header> + ({0,+,-1}<nsw><%outer.header> smax %n))
 ; CHECK-NEXT:  Loop %inner: Predicated backedge-taken count is ({0,+,1}<nuw><nsw><%outer.header> + ({0,+,-1}<nsw><%outer.header> smax %n))
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %inner: Trip multiple is 1
+; CHECK-NEXT:  Loop %inner: Trip multiple is 1
 ; CHECK-NEXT:  Loop %outer.header: backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: constant max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: symbolic max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: Predicated backedge-taken count is 0
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %outer.header: Trip multiple is 1
+; CHECK-NEXT:  Loop %outer.header: Trip multiple is 1
 ;
 entry:
   br label %outer.header
@@ -690,13 +690,13 @@ define void @step_is_neg_addrec_unknown_start(i32 %n) {
 ; CHECK-NEXT:  Loop %inner: symbolic max backedge-taken count is ({(-1 * %n),+,1}<nw><%outer.header> + (8 smax {%n,+,-1}<nsw><%outer.header>))
 ; CHECK-NEXT:  Loop %inner: Predicated backedge-taken count is ({(-1 * %n),+,1}<nw><%outer.header> + (8 smax {%n,+,-1}<nsw><%outer.header>))
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %inner: Trip multiple is 1
+; CHECK-NEXT:  Loop %inner: Trip multiple is 1
 ; CHECK-NEXT:  Loop %outer.header: backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: constant max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: symbolic max backedge-taken count is 0
 ; CHECK-NEXT:  Loop %outer.header: Predicated backedge-taken count is 0
 ; CHECK-NEXT:   Predicates:
-; CHECK:       Loop %outer.header: Trip multiple is 1
+; CHECK-NEXT:  Loop %outer.header: Trip multiple is 1
 ;
 entry:
   br label %outer.header
diff --git a/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll b/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll
index d8f5b0c3af9d815..22f98720520ebae 100644
--- a/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll
+++ b/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll
@@ -480,8 +480,8 @@ define i32 @start.from.sibling.iv.wide.cycled.phis.complex.phis(ptr %len.ptr, pt
 ; CHECK-NEXT:    [[IV_LCSSA2:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ]
 ; CHECK-NEXT:    [[SWITCH_COND:%.*]] = call i32 @switch.cond()
 ; CHECK-NEXT:    switch i32 [[SWITCH_COND]], label [[TAKE_SAME:%.*]] [
-; CHECK-NEXT:    i32 1, label [[TAKE_INCREMENT:%.*]]
-; CHECK-NEXT:    i32 2, label [[TAKE_SMAX:%.*]]
+; CHECK-NEXT:      i32 1, label [[TAKE_INCREMENT:%.*]]
+; CHECK-NEXT:      i32 2, label [[TAKE_SMAX:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       take.same:
 ; CHECK-NEXT:    br label [[OUTER_LOOP_BACKEDGE]]

>From 1e39575a981088e8596461a3511cce3ec4c3b274 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Fri, 3 Nov 2023 09:03:52 -0700
Subject: [PATCH 41/76] [RISCV] CSE by swapping conditional branches (#71111)

DAGCombiner, as well as InstCombine, tends to canonicalize GE/LE into
GT/LT, namely:
```
X >= C --> X > (C - 1)
```
This sometimes generates off-by-one constants that could otherwise have
been CSE'd with surrounding constants.
Instead of changing such canonicalization, this patch tries to swap
those branch conditions post-isel, in the hope of resurfacing more
constant CSE opportunities. More specifically, it performs the following
optimization:

For two constants C0 and C1 from
```
li Y, C0
li Z, C1
```
To remove the redundant `li Y, C0`,
 1. if C1 = C0 + 1 we can turn:
    (a) blt Y, X -> bge X, Z
    (b) bge Y, X -> blt X, Z
 2. if C1 = C0 - 1 we can turn:
    (a) blt X, Y -> bge Z, X
    (b) bge X, Y -> blt Z, X

This optimization will be done by PeepholeOptimizer through
RISCVInstrInfo::optimizeCondBranch.
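
As a rough illustration (hypothetical C++ source, written to mirror the
`u_case1_a` test added below): the store materializes `li a4, 32`, and
lowering the comparison would otherwise need the off-by-one constant 31
in a second register.
```
// Hypothetical source, illustration only. The store emits `li a4, 32`.
// Branching on `b <= 31` would need 31 in a register (branch when
// 31 < b); the peephole instead emits `bgeu a1, a4`, reusing the 32
// that is already live.
void u_case1_a(unsigned *a, unsigned b, unsigned *c, unsigned *d) {
  *a = 32;
  if (b <= 31u)
    *c = b;
  else
    *d = 87;
}
```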
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 119 +++++++++++++++++++++++
 llvm/lib/Target/RISCV/RISCVInstrInfo.h   |   2 +
 llvm/test/CodeGen/RISCV/branch-opt.ll    | 119 +++++++++++++++++++++++
 3 files changed, 240 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/branch-opt.ll

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 412fb7e7f7fc16c..1d19faa4e06e8a4 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1159,6 +1159,125 @@ bool RISCVInstrInfo::reverseBranchCondition(
   return false;
 }
 
+bool RISCVInstrInfo::optimizeCondBranch(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  MachineBasicBlock *TBB, *FBB;
+  SmallVector<MachineOperand, 3> Cond;
+  if (analyzeBranch(*MBB, TBB, FBB, Cond, /*AllowModify=*/false))
+    return false;
+  (void)FBB;
+
+  RISCVCC::CondCode CC = static_cast<RISCVCC::CondCode>(Cond[0].getImm());
+  assert(CC != RISCVCC::COND_INVALID);
+
+  if (CC == RISCVCC::COND_EQ || CC == RISCVCC::COND_NE)
+    return false;
+
+  // For two constants C0 and C1 from
+  // ```
+  // li Y, C0
+  // li Z, C1
+  // ```
+  // 1. if C1 = C0 + 1
+  // we can turn:
+  //  (a) blt Y, X -> bge X, Z
+  //  (b) bge Y, X -> blt X, Z
+  //
+  // 2. if C1 = C0 - 1
+  // we can turn:
+  //  (a) blt X, Y -> bge Z, X
+  //  (b) bge X, Y -> blt Z, X
+  //
+  // To make sure this optimization is really beneficial, we only
+  // optimize for cases where Y had only one use (i.e. only used by the branch).
+
+  // Right now we only care about LI (i.e. ADDI x0, imm)
+  auto isLoadImm = [](const MachineInstr *MI, int64_t &Imm) -> bool {
+    if (MI->getOpcode() == RISCV::ADDI && MI->getOperand(1).isReg() &&
+        MI->getOperand(1).getReg() == RISCV::X0) {
+      Imm = MI->getOperand(2).getImm();
+      return true;
+    }
+    return false;
+  };
+  // Either a load from immediate instruction or X0.
+  auto isFromLoadImm = [&](const MachineOperand &Op, int64_t &Imm) -> bool {
+    if (!Op.isReg())
+      return false;
+    Register Reg = Op.getReg();
+    if (Reg == RISCV::X0) {
+      Imm = 0;
+      return true;
+    }
+    if (!Reg.isVirtual())
+      return false;
+    return isLoadImm(MRI.getVRegDef(Op.getReg()), Imm);
+  };
+
+  MachineOperand &LHS = MI.getOperand(0);
+  MachineOperand &RHS = MI.getOperand(1);
+  // Try to find the register for constant Z; return
+  // invalid register otherwise.
+  auto searchConst = [&](int64_t C1) -> Register {
+    MachineBasicBlock::reverse_iterator II(&MI), E = MBB->rend();
+    auto DefC1 = std::find_if(++II, E, [&](const MachineInstr &I) -> bool {
+      int64_t Imm;
+      return isLoadImm(&I, Imm) && Imm == C1;
+    });
+    if (DefC1 != E)
+      return DefC1->getOperand(0).getReg();
+
+    return Register();
+  };
+
+  bool Modify = false;
+  int64_t C0;
+  if (isFromLoadImm(LHS, C0) && MRI.hasOneUse(LHS.getReg())) {
+    // Might be case 1.
+    // Signed integer overflow is UB. (UINT64_MAX is bigger so we don't need
+    // to worry about unsigned overflow here)
+    if (C0 < INT64_MAX)
+      if (Register RegZ = searchConst(C0 + 1)) {
+        reverseBranchCondition(Cond);
+        Cond[1] = MachineOperand::CreateReg(RHS.getReg(), /*isDef=*/false);
+        Cond[2] = MachineOperand::CreateReg(RegZ, /*isDef=*/false);
+        // We might extend the live range of Z, clear its kill flag to
+        // account for this.
+        MRI.clearKillFlags(RegZ);
+        Modify = true;
+      }
+  } else if (isFromLoadImm(RHS, C0) && MRI.hasOneUse(RHS.getReg())) {
+    // Might be case 2.
+    // For unsigned cases, we don't want C1 to wrap back to UINT64_MAX
+    // when C0 is zero.
+    if ((CC == RISCVCC::COND_GE || CC == RISCVCC::COND_LT) || C0)
+      if (Register RegZ = searchConst(C0 - 1)) {
+        reverseBranchCondition(Cond);
+        Cond[1] = MachineOperand::CreateReg(RegZ, /*isDef=*/false);
+        Cond[2] = MachineOperand::CreateReg(LHS.getReg(), /*isDef=*/false);
+        // We might extend the live range of Z, clear its kill flag to
+        // account for this.
+        MRI.clearKillFlags(RegZ);
+        Modify = true;
+      }
+  }
+
+  if (!Modify)
+    return false;
+
+  // Build the new branch and remove the old one.
+  BuildMI(*MBB, MI, MI.getDebugLoc(),
+          getBrCond(static_cast<RISCVCC::CondCode>(Cond[0].getImm())))
+      .add(Cond[1])
+      .add(Cond[2])
+      .addMBB(TBB);
+  MI.eraseFromParent();
+
+  return true;
+}
+
 MachineBasicBlock *
 RISCVInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
   assert(MI.getDesc().isBranch() && "Unexpected opcode!");
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index d0112a464677ab5..491278c2e017e7c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -121,6 +121,8 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
   bool
   reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
 
+  bool optimizeCondBranch(MachineInstr &MI) const override;
+
   MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
 
   bool isBranchOffsetInRange(unsigned BranchOpc,
diff --git a/llvm/test/CodeGen/RISCV/branch-opt.ll b/llvm/test/CodeGen/RISCV/branch-opt.ll
new file mode 100644
index 000000000000000..e4912bbf861a2a0
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/branch-opt.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=riscv32 -O2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -O2 -verify-machineinstrs < %s | FileCheck %s
+
+define void @u_case1_a(ptr %a, i32 signext %b, ptr %c, ptr %d) {
+; CHECK-LABEL: u_case1_a:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a4, 32
+; CHECK-NEXT:    sw a4, 0(a0)
+; CHECK-NEXT:    bgeu a1, a4, .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %block1
+; CHECK-NEXT:    sw a1, 0(a2)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: # %block2
+; CHECK-NEXT:    li a0, 87
+; CHECK-NEXT:    sw a0, 0(a3)
+; CHECK-NEXT:    ret
+  store i32 32, ptr %a
+  %p = icmp ule i32 %b, 31
+  br i1 %p, label %block1, label %block2
+
+block1:                                           ; preds = %0
+  store i32 %b, ptr %c
+  br label %end_block
+
+block2:                                           ; preds = %0
+  store i32 87, ptr %d
+  br label %end_block
+
+end_block:                                        ; preds = %block2, %block1
+  ret void
+}
+
+define void @case1_a(ptr %a, i32 signext %b, ptr %c, ptr %d) {
+; CHECK-LABEL: case1_a:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a4, -1
+; CHECK-NEXT:    sw a4, 0(a0)
+; CHECK-NEXT:    bge a1, a4, .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %block1
+; CHECK-NEXT:    sw a1, 0(a2)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_2: # %block2
+; CHECK-NEXT:    li a0, 87
+; CHECK-NEXT:    sw a0, 0(a3)
+; CHECK-NEXT:    ret
+  store i32 -1, ptr %a
+  %p = icmp sle i32 %b, -2
+  br i1 %p, label %block1, label %block2
+
+block1:                                           ; preds = %0
+  store i32 %b, ptr %c
+  br label %end_block
+
+block2:                                           ; preds = %0
+  store i32 87, ptr %d
+  br label %end_block
+
+end_block:                                        ; preds = %block2, %block1
+  ret void
+}
+
+define void @u_case2_a(ptr %a, i32 signext %b, ptr %c, ptr %d) {
+; CHECK-LABEL: u_case2_a:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a4, 32
+; CHECK-NEXT:    sw a4, 0(a0)
+; CHECK-NEXT:    bgeu a4, a1, .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %block1
+; CHECK-NEXT:    sw a1, 0(a2)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_2: # %block2
+; CHECK-NEXT:    li a0, 87
+; CHECK-NEXT:    sw a0, 0(a3)
+; CHECK-NEXT:    ret
+  store i32 32, ptr %a
+  %p = icmp uge i32 %b, 33
+  br i1 %p, label %block1, label %block2
+
+block1:                                           ; preds = %0
+  store i32 %b, ptr %c
+  br label %end_block
+
+block2:                                           ; preds = %0
+  store i32 87, ptr %d
+  br label %end_block
+
+end_block:                                        ; preds = %block2, %block1
+  ret void
+}
+
+define void @case2_a(ptr %a, i32 signext %b, ptr %c, ptr %d) {
+; CHECK-LABEL: case2_a:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a4, -4
+; CHECK-NEXT:    sw a4, 0(a0)
+; CHECK-NEXT:    bge a4, a1, .LBB3_2
+; CHECK-NEXT:  # %bb.1: # %block1
+; CHECK-NEXT:    sw a1, 0(a2)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_2: # %block2
+; CHECK-NEXT:    li a0, 87
+; CHECK-NEXT:    sw a0, 0(a3)
+; CHECK-NEXT:    ret
+  store i32 -4, ptr %a
+  %p = icmp sge i32 %b, -3
+  br i1 %p, label %block1, label %block2
+
+block1:                                           ; preds = %0
+  store i32 %b, ptr %c
+  br label %end_block
+
+block2:                                           ; preds = %0
+  store i32 87, ptr %d
+  br label %end_block
+
+end_block:                                        ; preds = %block2, %block1
+  ret void
+}

>From a6c8e27b3a052913a15a13ee0d4ac466c5ab3f92 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Tue, 31 Oct 2023 08:45:06 -0700
Subject: [PATCH 42/76] [IndVars] Generate zext nneg when locally obvious

zext nneg was recently added to the IR in #67982.  This patch teaches
SimplifyIndVars to prefer zext nneg over *both* sext and plain zext,
when a local SCEV query indicates the source is non-negative.

The choice to prefer zext nneg over sext looks slightly aggressive
here, but probably isn't so much in practice.  For cases where we'd
"remember" the range fact, instcombine would convert the sext into
a zext nneg anyway.  The only cases where this produces a different
result overall are when SCEV knows a non-local fact, and it doesn't
get materialized into the IR.  Those are exactly the cases where
using zext nneg is most useful.  We do run the risk of e.g. a
missing combine - since we haven't updated most of them yet - but
that seems like a manageable risk.

Note that there are much deeper algorithmic changes we could make
to this code to exploit zext nneg, but this seemed like a reasonable
and low-risk starting point.
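
In sketch form, the new extension choice is simply the following (a
condensed sketch of the createExtendInst change below; the surrounding
WidenIV context is assumed):
```
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *createExtend(ScalarEvolution *SE, IRBuilder<> &Builder,
                    Value *NarrowOper, Type *WideType, bool IsSigned) {
  // zext nneg subsumes both sext and zext when the source is provably
  // non-negative; constants fold and carry no flag, hence the dyn_cast.
  if (SE->isKnownNonNegative(SE->getSCEV(NarrowOper))) {
    auto *Res = Builder.CreateZExt(NarrowOper, WideType);
    if (auto *I = dyn_cast<Instruction>(Res))
      I->setNonNeg(true);
    return Res;
  }
  return IsSigned ? Builder.CreateSExt(NarrowOper, WideType)
                  : Builder.CreateZExt(NarrowOper, WideType);
}
```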
---
 llvm/lib/Transforms/Utils/SimplifyIndVar.cpp  | 19 +++++++++++++++++++
 llvm/test/Analysis/ScalarEvolution/guards.ll  |  2 +-
 .../Transforms/IndVarSimplify/X86/pr59615.ll  |  2 +-
 .../IndVarSimplify/post-inc-range.ll          |  2 +-
 llvm/test/Transforms/LoopFlatten/widen-iv2.ll |  2 +-
 llvm/test/Transforms/LoopFlatten/widen-iv3.ll |  2 +-
 6 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index ae3644183a735bc..de2556f3cec19c9 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1201,6 +1201,15 @@ Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType,
        L = L->getParentLoop())
     Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
 
+  // If we know the operand is never negative, prefer zext nneg.
+  // For constant expressions, fall back to plain sext or zext.
+  if (SE->isKnownNonNegative(SE->getSCEV(NarrowOper))) {
+    auto *Res = Builder.CreateZExt(NarrowOper, WideType);
+    if (auto *I = dyn_cast<Instruction>(Res))
+      I->setNonNeg(true);
+    return Res;
+  }
+
   return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
                     Builder.CreateZExt(NarrowOper, WideType);
 }
@@ -1686,6 +1695,16 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) {
     auto ExtendedOp = [&](Value * V)->Value * {
       if (V == NarrowUse)
         return WideBO;
+
+      // If we know the operand is never negative, prefer zext nneg.
+      // For constant expressions, fall back to plain sext or zext.
+      if (SE->isKnownNonNegative(SE->getSCEV(V))) {
+        auto *Res = Builder.CreateZExt(V, WideBO->getType());
+        if (auto *I = dyn_cast<Instruction>(Res))
+          I->setNonNeg(true);
+        return Res;
+      }
+
       if (ExtKind == ExtendKind::Zero)
         return Builder.CreateZExt(V, WideBO->getType());
       else
diff --git a/llvm/test/Analysis/ScalarEvolution/guards.ll b/llvm/test/Analysis/ScalarEvolution/guards.ll
index ea17c5840067afb..137630cd25e6873 100644
--- a/llvm/test/Analysis/ScalarEvolution/guards.ll
+++ b/llvm/test/Analysis/ScalarEvolution/guards.ll
@@ -57,7 +57,7 @@ define void @test_2(i32 %n, ptr %len_buf) {
 ; CHECK-SAME: (i32 [[N:%.*]], ptr [[LEN_BUF:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_BUF]], align 4, !range [[RNG1:![0-9]+]]
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[LEN]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr59615.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr59615.ll
index 17b7b9d40b07a53..4fe7f7fd01a0660 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/pr59615.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/pr59615.ll
@@ -17,7 +17,7 @@ define void @test() {
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb8:
 ; CHECK-NEXT:    [[VAR9:%.*]] = load atomic i32, ptr addrspace(1) poison unordered, align 8, !range [[RNG0]], !invariant.load !1, !noundef !1
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[VAR9]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[VAR9]] to i64
 ; CHECK-NEXT:    [[VAR10:%.*]] = icmp ult i64 [[INDVARS_IV]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[VAR10]], label [[BB12]], label [[BB11:%.*]]
 ; CHECK:       bb11:
diff --git a/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll b/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll
index 5c22ba1044b60af..1df0d62168af24e 100644
--- a/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll
+++ b/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll
@@ -120,7 +120,7 @@ define void @test_range_metadata(ptr %array_length_ptr, ptr %base,
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH:%.*]] ]
 ; CHECK-NEXT:    [[ARRAY_LENGTH:%.*]] = load i32, ptr [[ARRAY_LENGTH_PTR:%.*]], align 4, !range [[RNG0:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[ARRAY_LENGTH]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext nneg i32 [[ARRAY_LENGTH]] to i64
 ; CHECK-NEXT:    [[WITHIN_LIMITS:%.*]] = icmp ult i64 [[INDVARS_IV]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[WITHIN_LIMITS]], label [[CONTINUE:%.*]], label [[FOR_END:%.*]]
 ; CHECK:       continue:
diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
index 946b98420249e2f..f4c8b90d4bc27b0 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
@@ -39,7 +39,7 @@ define dso_local i32 @fn1() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY3_US]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ]
 ; CHECK-NEXT:    [[J_014_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY3_US]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[INDVAR]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[J_014_US]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = zext nneg i32 [[J_014_US]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i64 [[TMP8]], [[TMP5]]
 ; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[J_014_US]], [[MUL_US]]
 ; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
index df8ee6ff0750574..b3a9ac823fd2df7 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
@@ -18,7 +18,7 @@ define i16 @foo() {
 ; CHECK-NEXT:    [[SUM_012:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[ADD5_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul nsw i32 [[INDVAR2]], 16
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i16 [[I_013]], 16
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[MUL]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i16 [[MUL]] to i32
 ; CHECK-NEXT:    br label [[FOR_BODY4:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[ADD5_LCSSA_LCSSA:%.*]] = phi i16 [ [[ADD5_LCSSA]], [[FOR_COND_CLEANUP3]] ]

>From 17798ad7c3596d9b74b5f55ac41f10fe01e8ee4b Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh <dthorn at google.com>
Date: Fri, 3 Nov 2023 09:26:47 -0700
Subject: [PATCH 43/76] [lldb] Use get-task-allow entitlement on macOS too
 (#71112)

Running the LLDB test suite in a GUI-less macOS environment (say, ssh)
requires that the debugged tasks be signed with the get-task-allow
entitlement.
---
 .../Python/lldbsuite/test/builders/darwin.py      | 15 ++++++++-------
 .../lldbsuite/test/make/entitlements-macos.plist  |  8 ++++++++
 2 files changed, 16 insertions(+), 7 deletions(-)
 create mode 100644 lldb/packages/Python/lldbsuite/test/make/entitlements-macos.plist

diff --git a/lldb/packages/Python/lldbsuite/test/builders/darwin.py b/lldb/packages/Python/lldbsuite/test/builders/darwin.py
index 40dd13bcfdea1d6..a023bda3ad80103 100644
--- a/lldb/packages/Python/lldbsuite/test/builders/darwin.py
+++ b/lldb/packages/Python/lldbsuite/test/builders/darwin.py
@@ -88,17 +88,18 @@ def getExtraMakeArgs(self):
                 args["FRAMEWORK_INCLUDES"] = "-F{}".format(private_frameworks)
 
         operating_system, env = get_os_and_env()
-        if operating_system and operating_system != "macosx":
-            builder_dir = os.path.dirname(os.path.abspath(__file__))
-            test_dir = os.path.dirname(builder_dir)
+
+        builder_dir = os.path.dirname(os.path.abspath(__file__))
+        test_dir = os.path.dirname(builder_dir)
+        if not operating_system:
+            entitlements_file = "entitlements-macos.plist"
+        else:
             if env == "simulator":
                 entitlements_file = "entitlements-simulator.plist"
             else:
                 entitlements_file = "entitlements.plist"
-            entitlements = os.path.join(test_dir, "make", entitlements_file)
-            args["CODESIGN"] = "codesign --entitlements {}".format(entitlements)
-        else:
-            args["CODESIGN"] = "codesign"
+        entitlements = os.path.join(test_dir, "make", entitlements_file)
+        args["CODESIGN"] = "codesign --entitlements {}".format(entitlements)
 
         # Return extra args as a formatted string.
         return ["{}={}".format(key, value) for key, value in args.items()]
diff --git a/lldb/packages/Python/lldbsuite/test/make/entitlements-macos.plist b/lldb/packages/Python/lldbsuite/test/make/entitlements-macos.plist
new file mode 100644
index 000000000000000..9acd12816c91373
--- /dev/null
+++ b/lldb/packages/Python/lldbsuite/test/make/entitlements-macos.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>com.apple.security.get-task-allow</key>
+    <true/>
+</dict>
+</plist>

>From 778a48468b5fce8deafb40be0704cb69b052a50a Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <nickdesaulniers at users.noreply.github.com>
Date: Fri, 3 Nov 2023 09:37:07 -0700
Subject: [PATCH 44/76] [InlineAsm] Steal a bit to denote a register is
 foldable (#70738)

When using the inline asm constraint string "rm" (or "g"), we generally
would like the compiler to choose "r", but it is permitted to choose "m"
if there's register pressure. This is distinct from plain "r", where the
register is not permitted to be spilled to the stack.

The decision of which to use must be made at some point.  Currently, the
instruction selection frameworks (ISELs) make the choice, and the
register allocators had better be able to handle the result.

Steal a bit from Storage when using register operands to disambiguate
between the two cases.  Add helpers/getters/setters, and print in MIR
when such a register is foldable.

The getter will later be used by the register allocation frameworks (and
asserted by the ISELs), while the setters will be used by the instruction
selection frameworks.

Link: https://github.com/llvm/llvm-project/issues/20571
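
A minimal sketch of the intended flow (hypothetical call sites, assuming
the existing `Flag(Kind, NumOps)` constructor; the setter/getter are the
ones added here):
```
#include "llvm/IR/InlineAsm.h"
using namespace llvm;

// Minimal sketch, not from the patch. ISel resolves an "rm" operand to a
// register but records that memory was also permitted; the register
// allocator later reads the bit back, e.g. through
// MachineInstr::mayFoldInlineAsmRegOp(OpId).
void markFoldableRegUse() {
  InlineAsm::Flag F(InlineAsm::Kind::RegUse, /*NumOps=*/1);
  F.setRegMayBeFolded(true);              // constraint was "rm" (or "g")
  bool Foldable = F.getRegMayBeFolded();  // queried by the allocator
  (void)Foldable;
}
```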
---
 llvm/include/llvm/CodeGen/MachineInstr.h |  4 +++
 llvm/include/llvm/IR/InlineAsm.h         | 35 ++++++++++++++++++++----
 llvm/lib/CodeGen/MachineInstr.cpp        | 23 ++++++++++++++++
 llvm/lib/CodeGen/TargetInstrInfo.cpp     |  4 +++
 4 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 4877f43e8578d1c..bd72ac23fc9c08e 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1364,6 +1364,10 @@ class MachineInstr
     return getOpcode() == TargetOpcode::INLINEASM ||
            getOpcode() == TargetOpcode::INLINEASM_BR;
   }
+  /// Returns true if the register operand can be folded with a load or store
+  /// into a frame index. Does so by checking the InlineAsm::Flag immediate
+  /// operand at OpId - 1.
+  bool mayFoldInlineAsmRegOp(unsigned OpId) const;
 
   bool isStackAligningInlineAsm() const;
   InlineAsm::AsmDialect getInlineAsmDialect() const;
diff --git a/llvm/include/llvm/IR/InlineAsm.h b/llvm/include/llvm/IR/InlineAsm.h
index 969ad42816a7e52..e5f506e5694daf2 100644
--- a/llvm/include/llvm/IR/InlineAsm.h
+++ b/llvm/include/llvm/IR/InlineAsm.h
@@ -291,18 +291,23 @@ class InlineAsm final : public Value {
   //     Bits 30-16 - A ConstraintCode:: value indicating the original
   //                  constraint code. (MemConstraintCode)
   //   Else:
-  //     Bits 30-16 - The register class ID to use for the operand. (RegClass)
+  //     Bits 29-16 - The register class ID to use for the operand. (RegClass)
+  //     Bit  30    - If the register is permitted to be spilled.
+  //                  (RegMayBeFolded)
+  //                  Defaults to false "r", may be set for constraints like
+  //                  "rm" (or "g").
   //
-  //   As such, MatchedOperandNo, MemConstraintCode, and RegClass are views of
-  //   the same slice of bits, but are mutually exclusive depending on the
-  //   fields IsMatched then KindField.
+  //   As such, MatchedOperandNo, MemConstraintCode, and
+  //   (RegClass+RegMayBeFolded) are views of the same slice of bits, but are
+  //   mutually exclusive depending on the fields IsMatched then KindField.
   class Flag {
     uint32_t Storage;
     using KindField = Bitfield::Element<Kind, 0, 3, Kind::Func>;
     using NumOperands = Bitfield::Element<unsigned, 3, 13>;
     using MatchedOperandNo = Bitfield::Element<unsigned, 16, 15>;
     using MemConstraintCode = Bitfield::Element<ConstraintCode, 16, 15, ConstraintCode::Max>;
-    using RegClass = Bitfield::Element<unsigned, 16, 15>;
+    using RegClass = Bitfield::Element<unsigned, 16, 14>;
+    using RegMayBeFolded = Bitfield::Element<bool, 30, 1>;
     using IsMatched = Bitfield::Element<bool, 31, 1>;
 
 
@@ -413,6 +418,26 @@ class InlineAsm final : public Value {
              "Flag is not a memory or function constraint!");
       Bitfield::set<MemConstraintCode>(Storage, ConstraintCode::Unknown);
     }
+
+    /// Set a bit to denote that while this operand is some kind of register
+    /// (use, def, ...), a memory flag did appear in the original constraint
+    /// list.  This is set by the instruction selection framework, and consumed
+    /// by the register allocator. While the register allocator is generally
+    /// responsible for spilling registers, we need to be able to distinguish
+    /// between registers that the register allocator has permission to fold
+    /// ("rm") vs ones it does not ("r"). This is because the inline asm may use
+    /// instructions which don't support memory addressing modes for that
+    /// operand.
+    void setRegMayBeFolded(bool B) {
+      assert((isRegDefKind() || isRegDefEarlyClobberKind() || isRegUseKind()) &&
+             "Must be reg");
+      Bitfield::set<RegMayBeFolded>(Storage, B);
+    }
+    bool getRegMayBeFolded() const {
+      assert((isRegDefKind() || isRegDefEarlyClobberKind() || isRegUseKind()) &&
+             "Must be reg");
+      return Bitfield::get<RegMayBeFolded>(Storage);
+    }
   };
 
   static std::vector<StringRef> getExtraInfoNames(unsigned ExtraInfo) {
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 048563cc2bcc4e4..9e7b4df2576feee 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1792,6 +1792,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
       if (F.isUseOperandTiedToDef(TiedTo))
         OS << " tiedto:$" << TiedTo;
 
+      if ((F.isRegDefKind() || F.isRegDefEarlyClobberKind() ||
+           F.isRegUseKind()) &&
+          F.getRegMayBeFolded()) {
+        OS << " foldable";
+      }
+
       OS << ']';
 
       // Compute the index of the next operand descriptor.
@@ -2526,3 +2532,20 @@ void MachineInstr::insert(mop_iterator InsertBefore,
     tieOperands(Tie1, Tie2);
   }
 }
+
+bool MachineInstr::mayFoldInlineAsmRegOp(unsigned OpId) const {
+  assert(OpId && "expected non-zero operand id");
+  assert(isInlineAsm() && "should only be used on inline asm");
+
+  if (!getOperand(OpId).isReg())
+    return false;
+
+  const MachineOperand &MD = getOperand(OpId - 1);
+  if (!MD.isImm())
+    return false;
+
+  InlineAsm::Flag F(MD.getImm());
+  if (F.isRegUseKind() || F.isRegDefKind() || F.isRegDefEarlyClobberKind())
+    return F.getRegMayBeFolded();
+  return false;
+}
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index fe7efb73a2dce83..3013a768bc4d566 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1639,6 +1639,10 @@ std::string TargetInstrInfo::createMIROperandComment(
   if (F.isUseOperandTiedToDef(TiedTo))
     OS << " tiedto:$" << TiedTo;
 
+  if ((F.isRegDefKind() || F.isRegDefEarlyClobberKind() || F.isRegUseKind()) &&
+      F.getRegMayBeFolded())
+    OS << " foldable";
+
   return OS.str();
 }
 

>From 890335bb28b96ab09ce3d6cf9c2e611bcabb36eb Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag at gmail.com>
Date: Fri, 3 Nov 2023 09:41:26 -0700
Subject: [PATCH 45/76] [InstrProf] Do not block functions from PGOUse (#71106)

The `skipPGO()` function was added in https://reviews.llvm.org/D137184.
Unfortunately, it also blocked functions from being annotated (PGOUse),
which I believe will confuse users if a function has a profile but is
not PGO'd.

The docs for `noprofile` and `skipprofile` only claim to block
instrumentation, not PGO optimization:
https://llvm.org/docs/LangRef.html
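
For illustration (source not from the patch; `no_profile_instrument_function`
is the clang attribute that lowers to the IR `noprofile` attribute):
```
// After this change, a function carrying `noprofile` is still skipped by
// instrumentation (PGOGen), but can be annotated from an existing profile
// (PGOUse).
__attribute__((no_profile_instrument_function))
int hot_path(int x) { return x * 2; }
```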
---
 .../Instrumentation/PGOInstrumentation.cpp    | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 7ad1c9bc54f3780..49608768f8ba68a 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1760,17 +1760,10 @@ static void collectComdatMembers(
       ComdatMembers.insert(std::make_pair(C, &GA));
 }
 
-// Don't perform PGO instrumeatnion / profile-use.
-static bool skipPGO(const Function &F) {
+// Return true if we should not find instrumentation data for this function
+static bool skipPGOUse(const Function &F) {
   if (F.isDeclaration())
     return true;
-  if (F.hasFnAttribute(llvm::Attribute::NoProfile))
-    return true;
-  if (F.hasFnAttribute(llvm::Attribute::SkipProfile))
-    return true;
-  if (F.getInstructionCount() < PGOFunctionSizeThreshold)
-    return true;
-
   // If there are too many critical edges, PGO might cause
   // compiler time problem. Skip PGO if the number of
   // critical edges execeed the threshold.
@@ -1788,7 +1781,19 @@ static bool skipPGO(const Function &F) {
                       << " exceed the threshold. Skip PGO.\n");
     return true;
   }
+  return false;
+}
 
+// Return true if we should not instrument this function
+static bool skipPGOGen(const Function &F) {
+  if (skipPGOUse(F))
+    return true;
+  if (F.hasFnAttribute(llvm::Attribute::NoProfile))
+    return true;
+  if (F.hasFnAttribute(llvm::Attribute::SkipProfile))
+    return true;
+  if (F.getInstructionCount() < PGOFunctionSizeThreshold)
+    return true;
   return false;
 }
 
@@ -1804,7 +1809,7 @@ static bool InstrumentAllFunctions(
   collectComdatMembers(M, ComdatMembers);
 
   for (auto &F : M) {
-    if (skipPGO(F))
+    if (skipPGOGen(F))
       continue;
     auto &TLI = LookupTLI(F);
     auto *BPI = LookupBPI(F);
@@ -2031,7 +2036,7 @@ static bool annotateAllFunctions(
     InstrumentFuncEntry = PGOInstrumentEntry;
   bool HasSingleByteCoverage = PGOReader->hasSingleByteCoverage();
   for (auto &F : M) {
-    if (skipPGO(F))
+    if (skipPGOUse(F))
       continue;
     auto &TLI = LookupTLI(F);
     auto *BPI = LookupBPI(F);

>From 7ccdad14ed2cab9ff9a7eb76a6cb7ddf10694ddc Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 3 Nov 2023 09:50:01 -0700
Subject: [PATCH 46/76] [RISCV][GISel] Add helper to convert an LLT size to a
 RegisterBankInfo::ValueMapping* for FP. (#71123)

Use this to simplify code.
---
 .../RISCV/GISel/RISCVRegisterBankInfo.cpp     | 43 ++++++-------------
 1 file changed, 14 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index f005948d2094445..99d3000568a9f17 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -103,6 +103,12 @@ RISCVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   }
 }
 
+static const RegisterBankInfo::ValueMapping *getFPValueMapping(unsigned Size) {
+  assert(Size == 32 || Size == 64);
+  unsigned Idx = Size == 64 ? RISCV::FPR64Idx : RISCV::FPR32Idx;
+  return &RISCV::ValueMappings[Idx];
+}
+
 const RegisterBankInfo::InstructionMapping &
 RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   const unsigned Opc = MI.getOpcode();
@@ -185,47 +191,26 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case TargetOpcode::G_FMAXNUM:
   case TargetOpcode::G_FMINNUM: {
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-    OperandsMapping = Ty.getSizeInBits() == 64
-                          ? &RISCV::ValueMappings[RISCV::FPR64Idx]
-                          : &RISCV::ValueMappings[RISCV::FPR32Idx];
+    OperandsMapping = getFPValueMapping(Ty.getSizeInBits());
     break;
   }
   case TargetOpcode::G_FMA: {
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-    OperandsMapping =
-        Ty.getSizeInBits() == 64
-            ? getOperandsMapping({&RISCV::ValueMappings[RISCV::FPR64Idx],
-                                  &RISCV::ValueMappings[RISCV::FPR64Idx],
-                                  &RISCV::ValueMappings[RISCV::FPR64Idx],
-                                  &RISCV::ValueMappings[RISCV::FPR64Idx]})
-            : getOperandsMapping({&RISCV::ValueMappings[RISCV::FPR32Idx],
-                                  &RISCV::ValueMappings[RISCV::FPR32Idx],
-                                  &RISCV::ValueMappings[RISCV::FPR32Idx],
-                                  &RISCV::ValueMappings[RISCV::FPR32Idx]});
-    break;
-  }
-  case TargetOpcode::G_FPEXT: {
-    LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
-    (void)ToTy;
-    LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
-    (void)FromTy;
-    assert(ToTy.getSizeInBits() == 64 && FromTy.getSizeInBits() == 32 &&
-           "Unsupported size for G_FPEXT");
-    OperandsMapping =
-        getOperandsMapping({&RISCV::ValueMappings[RISCV::FPR64Idx],
-                            &RISCV::ValueMappings[RISCV::FPR32Idx]});
+    const RegisterBankInfo::ValueMapping *FPValueMapping =
+        getFPValueMapping(Ty.getSizeInBits());
+    OperandsMapping = getOperandsMapping(
+        {FPValueMapping, FPValueMapping, FPValueMapping, FPValueMapping});
     break;
   }
+  case TargetOpcode::G_FPEXT:
   case TargetOpcode::G_FPTRUNC: {
     LLT ToTy = MRI.getType(MI.getOperand(0).getReg());
     (void)ToTy;
     LLT FromTy = MRI.getType(MI.getOperand(1).getReg());
     (void)FromTy;
-    assert(ToTy.getSizeInBits() == 32 && FromTy.getSizeInBits() == 64 &&
-           "Unsupported size for G_FPTRUNC");
     OperandsMapping =
-        getOperandsMapping({&RISCV::ValueMappings[RISCV::FPR32Idx],
-                            &RISCV::ValueMappings[RISCV::FPR64Idx]});
+        getOperandsMapping({getFPValueMapping(ToTy.getSizeInBits()),
+                            getFPValueMapping(FromTy.getSizeInBits())});
     break;
   }
   default:

>From 8d24d3900ec3f28902b2fad4a2c2c2b789257424 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 3 Nov 2023 10:08:00 -0700
Subject: [PATCH 47/76] [Mips] In LowerShift*Parts, xor with bits-1 instead of
 -1. (#71149)

If we start with an i128 shift, the initial shift amount would usually
have zeros in bit 8 and above. XORing the shift amount with -1 will set
those upper bits to 1. If DAGCombiner is able to prove those bits are
now 1, then the shift that uses the xor will be replaced with undef,
which we don't want.

Reduce the xor constant to VT.bits-1 where VT is half the size of the
larger shift type. This avoids toggling the upper bits. The hardware
shift instruction only uses the lower bits of the shift amount. I assume
the code used NOT because the hardware doesn't use the upper bits, but
that isn't compatible with the LLVM poison semantics.

Fixes #71142.
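
A small worked example of the difference, for the i64-on-MIPS32 case where
VT is i32 and VT.bits-1 is 31 (illustration only, not from the patch):
```
#include <cstdio>

int main() {
  unsigned shamt = 3;            // i64 shift amount, valid range 0..63
  unsigned xorAll = shamt ^ ~0u; // 0xfffffffc: upper bits become 1, so the
                                 // amount can be proven >= 32 and the srl
                                 // folds to undef under poison semantics
  unsigned xorLow = shamt ^ 31u; // 28: flips only the 5 bits the 32-bit
                                 // hardware shift reads; 1 + 28 == 32 - 3,
                                 // so srl(srl(lo, 1), 28) still shifts lo
                                 // right by 32 - shamt in total
  std::printf("%#x %u\n", xorAll, xorLow);
  return 0;
}
```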
---
 llvm/lib/Target/Mips/MipsISelLowering.cpp | 14 ++++++------
 llvm/test/CodeGen/Mips/llvm-ir/ashr.ll    | 24 ++++++++++-----------
 llvm/test/CodeGen/Mips/llvm-ir/lshr.ll    | 26 +++++++++++------------
 llvm/test/CodeGen/Mips/llvm-ir/shl.ll     | 26 +++++++++++------------
 4 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 061d035b7e246c7..f09d52aa5fd6410 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -2593,12 +2593,13 @@ SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
   SDValue Shamt = Op.getOperand(2);
   // if shamt < (VT.bits):
   //  lo = (shl lo, shamt)
-  //  hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt))
+  //  hi = (or (shl hi, shamt) (srl (srl lo, 1), (xor shamt, (VT.bits-1))))
   // else:
   //  lo = 0
   //  hi = (shl lo, shamt[4:0])
-  SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
-                            DAG.getConstant(-1, DL, MVT::i32));
+  SDValue Not =
+      DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+                  DAG.getConstant(VT.getSizeInBits() - 1, DL, MVT::i32));
   SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo,
                                       DAG.getConstant(1, DL, VT));
   SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, Not);
@@ -2623,7 +2624,7 @@ SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
   MVT VT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;
 
   // if shamt < (VT.bits):
-  //  lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt))
+  //  lo = (or (shl (shl hi, 1), (xor shamt, (VT.bits-1))) (srl lo, shamt))
   //  if isSRA:
   //    hi = (sra hi, shamt)
   //  else:
@@ -2635,8 +2636,9 @@ SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
   //  else:
   //   lo = (srl hi, shamt[4:0])
   //   hi = 0
-  SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
-                            DAG.getConstant(-1, DL, MVT::i32));
+  SDValue Not =
+      DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+                  DAG.getConstant(VT.getSizeInBits() - 1, DL, MVT::i32));
   SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, VT, Hi,
                                      DAG.getConstant(1, DL, VT));
   SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, ShiftLeft1Hi, Not);
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 47d18b9b5c533e9..453ca0d6bab3b18 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -280,7 +280,7 @@ define signext i64 @ashr_i64(i64 signext %a, i64 signext %b) {
 ; MIPS-NEXT:    srav $3, $4, $7
 ; MIPS-NEXT:  # %bb.1: # %entry
 ; MIPS-NEXT:    srlv $1, $5, $7
-; MIPS-NEXT:    not $2, $7
+; MIPS-NEXT:    xori $2, $7, 31
 ; MIPS-NEXT:    sll $4, $4, 1
 ; MIPS-NEXT:    sllv $2, $4, $2
 ; MIPS-NEXT:    or $1, $2, $1
@@ -294,7 +294,7 @@ define signext i64 @ashr_i64(i64 signext %a, i64 signext %b) {
 ; MIPS32-LABEL: ashr_i64:
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    srlv $1, $5, $7
-; MIPS32-NEXT:    not $2, $7
+; MIPS32-NEXT:    xori $2, $7, 31
 ; MIPS32-NEXT:    sll $3, $4, 1
 ; MIPS32-NEXT:    sllv $2, $3, $2
 ; MIPS32-NEXT:    or $3, $2, $1
@@ -308,7 +308,7 @@ define signext i64 @ashr_i64(i64 signext %a, i64 signext %b) {
 ; 32R2-LABEL: ashr_i64:
 ; 32R2:       # %bb.0: # %entry
 ; 32R2-NEXT:    srlv $1, $5, $7
-; 32R2-NEXT:    not $2, $7
+; 32R2-NEXT:    xori $2, $7, 31
 ; 32R2-NEXT:    sll $3, $4, 1
 ; 32R2-NEXT:    sllv $2, $3, $2
 ; 32R2-NEXT:    or $3, $2, $1
@@ -328,7 +328,7 @@ define signext i64 @ashr_i64(i64 signext %a, i64 signext %b) {
 ; 32R6-NEXT:    selnez $6, $6, $3
 ; 32R6-NEXT:    or $2, $6, $2
 ; 32R6-NEXT:    srlv $5, $5, $7
-; 32R6-NEXT:    not $6, $7
+; 32R6-NEXT:    xori $6, $7, 31
 ; 32R6-NEXT:    sll $4, $4, 1
 ; 32R6-NEXT:    sllv $4, $4, $6
 ; 32R6-NEXT:    or $4, $4, $5
@@ -360,9 +360,9 @@ define signext i64 @ashr_i64(i64 signext %a, i64 signext %b) {
 ; MMR3-LABEL: ashr_i64:
 ; MMR3:       # %bb.0: # %entry
 ; MMR3-NEXT:    srlv $2, $5, $7
-; MMR3-NEXT:    not16 $3, $7
-; MMR3-NEXT:    sll16 $5, $4, 1
-; MMR3-NEXT:    sllv $3, $5, $3
+; MMR3-NEXT:    xori $1, $7, 31
+; MMR3-NEXT:    sll16 $3, $4, 1
+; MMR3-NEXT:    sllv $3, $3, $1
 ; MMR3-NEXT:    or16 $3, $2
 ; MMR3-NEXT:    srav $2, $4, $7
 ; MMR3-NEXT:    andi16 $5, $7, 32
@@ -380,7 +380,7 @@ define signext i64 @ashr_i64(i64 signext %a, i64 signext %b) {
 ; MMR6-NEXT:    selnez $6, $6, $3
 ; MMR6-NEXT:    or $2, $6, $2
 ; MMR6-NEXT:    srlv $5, $5, $7
-; MMR6-NEXT:    not16 $6, $7
+; MMR6-NEXT:    xori $6, $7, 31
 ; MMR6-NEXT:    sll16 $4, $4, 1
 ; MMR6-NEXT:    sllv $4, $4, $6
 ; MMR6-NEXT:    or $4, $4, $5
@@ -609,7 +609,7 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS3-NEXT:  # %bb.1: # %entry
 ; MIPS3-NEXT:    dsrlv $1, $5, $7
 ; MIPS3-NEXT:    dsll $4, $4, 1
-; MIPS3-NEXT:    not $2, $2
+; MIPS3-NEXT:    xori $2, $2, 63
 ; MIPS3-NEXT:    dsllv $2, $4, $2
 ; MIPS3-NEXT:    or $1, $2, $1
 ; MIPS3-NEXT:    move $2, $3
@@ -624,7 +624,7 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64-NEXT:    dsrlv $1, $5, $7
 ; MIPS64-NEXT:    dsll $2, $4, 1
 ; MIPS64-NEXT:    sll $5, $7, 0
-; MIPS64-NEXT:    not $3, $5
+; MIPS64-NEXT:    xori $3, $5, 63
 ; MIPS64-NEXT:    dsllv $2, $2, $3
 ; MIPS64-NEXT:    or $3, $2, $1
 ; MIPS64-NEXT:    dsrav $2, $4, $7
@@ -639,7 +639,7 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64R2-NEXT:    dsrlv $1, $5, $7
 ; MIPS64R2-NEXT:    dsll $2, $4, 1
 ; MIPS64R2-NEXT:    sll $5, $7, 0
-; MIPS64R2-NEXT:    not $3, $5
+; MIPS64R2-NEXT:    xori $3, $5, 63
 ; MIPS64R2-NEXT:    dsllv $2, $2, $3
 ; MIPS64R2-NEXT:    or $3, $2, $1
 ; MIPS64R2-NEXT:    dsrav $2, $4, $7
@@ -661,7 +661,7 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64R6-NEXT:    or $2, $8, $2
 ; MIPS64R6-NEXT:    dsrlv $5, $5, $7
 ; MIPS64R6-NEXT:    dsll $4, $4, 1
-; MIPS64R6-NEXT:    not $3, $3
+; MIPS64R6-NEXT:    xori $3, $3, 63
 ; MIPS64R6-NEXT:    dsllv $3, $4, $3
 ; MIPS64R6-NEXT:    or $3, $3, $5
 ; MIPS64R6-NEXT:    seleqz $3, $3, $6
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index c4e05117d28e15e..ddbb1f217837a46 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -283,7 +283,7 @@ define signext i64 @lshr_i64(i64 signext %a, i64 signext %b) {
 ; MIPS2-NEXT:    addiu $2, $zero, 0
 ; MIPS2-NEXT:  # %bb.1: # %entry
 ; MIPS2-NEXT:    srlv $1, $5, $7
-; MIPS2-NEXT:    not $2, $7
+; MIPS2-NEXT:    xori $2, $7, 31
 ; MIPS2-NEXT:    sll $3, $4, 1
 ; MIPS2-NEXT:    sllv $2, $3, $2
 ; MIPS2-NEXT:    or $3, $2, $1
@@ -296,7 +296,7 @@ define signext i64 @lshr_i64(i64 signext %a, i64 signext %b) {
 ; MIPS32-LABEL: lshr_i64:
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    srlv $1, $5, $7
-; MIPS32-NEXT:    not $2, $7
+; MIPS32-NEXT:    xori $2, $7, 31
 ; MIPS32-NEXT:    sll $3, $4, 1
 ; MIPS32-NEXT:    sllv $2, $3, $2
 ; MIPS32-NEXT:    or $3, $2, $1
@@ -309,7 +309,7 @@ define signext i64 @lshr_i64(i64 signext %a, i64 signext %b) {
 ; MIPS32R2-LABEL: lshr_i64:
 ; MIPS32R2:       # %bb.0: # %entry
 ; MIPS32R2-NEXT:    srlv $1, $5, $7
-; MIPS32R2-NEXT:    not $2, $7
+; MIPS32R2-NEXT:    xori $2, $7, 31
 ; MIPS32R2-NEXT:    sll $3, $4, 1
 ; MIPS32R2-NEXT:    sllv $2, $3, $2
 ; MIPS32R2-NEXT:    or $3, $2, $1
@@ -322,7 +322,7 @@ define signext i64 @lshr_i64(i64 signext %a, i64 signext %b) {
 ; MIPS32R6-LABEL: lshr_i64:
 ; MIPS32R6:       # %bb.0: # %entry
 ; MIPS32R6-NEXT:    srlv $1, $5, $7
-; MIPS32R6-NEXT:    not $2, $7
+; MIPS32R6-NEXT:    xori $2, $7, 31
 ; MIPS32R6-NEXT:    sll $3, $4, 1
 ; MIPS32R6-NEXT:    sllv $2, $3, $2
 ; MIPS32R6-NEXT:    or $1, $2, $1
@@ -362,9 +362,9 @@ define signext i64 @lshr_i64(i64 signext %a, i64 signext %b) {
 ; MMR3-LABEL: lshr_i64:
 ; MMR3:       # %bb.0: # %entry
 ; MMR3-NEXT:    srlv $2, $5, $7
-; MMR3-NEXT:    not16 $3, $7
-; MMR3-NEXT:    sll16 $5, $4, 1
-; MMR3-NEXT:    sllv $3, $5, $3
+; MMR3-NEXT:    xori $1, $7, 31
+; MMR3-NEXT:    sll16 $3, $4, 1
+; MMR3-NEXT:    sllv $3, $3, $1
 ; MMR3-NEXT:    or16 $3, $2
 ; MMR3-NEXT:    srlv $2, $4, $7
 ; MMR3-NEXT:    andi16 $4, $7, 32
@@ -376,7 +376,7 @@ define signext i64 @lshr_i64(i64 signext %a, i64 signext %b) {
 ; MMR6-LABEL: lshr_i64:
 ; MMR6:       # %bb.0: # %entry
 ; MMR6-NEXT:    srlv $1, $5, $7
-; MMR6-NEXT:    not16 $2, $7
+; MMR6-NEXT:    xori $2, $7, 31
 ; MMR6-NEXT:    sll16 $3, $4, 1
 ; MMR6-NEXT:    sllv $2, $3, $2
 ; MMR6-NEXT:    or $1, $2, $1
@@ -606,7 +606,7 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS3-NEXT:  # %bb.1: # %entry
 ; MIPS3-NEXT:    dsrlv $1, $5, $7
 ; MIPS3-NEXT:    dsll $2, $4, 1
-; MIPS3-NEXT:    not $3, $3
+; MIPS3-NEXT:    xori $3, $3, 63
 ; MIPS3-NEXT:    dsllv $2, $2, $3
 ; MIPS3-NEXT:    or $3, $2, $1
 ; MIPS3-NEXT:    jr $ra
@@ -620,7 +620,7 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS4-NEXT:    dsrlv $1, $5, $7
 ; MIPS4-NEXT:    dsll $2, $4, 1
 ; MIPS4-NEXT:    sll $5, $7, 0
-; MIPS4-NEXT:    not $3, $5
+; MIPS4-NEXT:    xori $3, $5, 63
 ; MIPS4-NEXT:    dsllv $2, $2, $3
 ; MIPS4-NEXT:    or $3, $2, $1
 ; MIPS4-NEXT:    dsrlv $2, $4, $7
@@ -634,7 +634,7 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64-NEXT:    dsrlv $1, $5, $7
 ; MIPS64-NEXT:    dsll $2, $4, 1
 ; MIPS64-NEXT:    sll $5, $7, 0
-; MIPS64-NEXT:    not $3, $5
+; MIPS64-NEXT:    xori $3, $5, 63
 ; MIPS64-NEXT:    dsllv $2, $2, $3
 ; MIPS64-NEXT:    or $3, $2, $1
 ; MIPS64-NEXT:    dsrlv $2, $4, $7
@@ -648,7 +648,7 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64R2-NEXT:    dsrlv $1, $5, $7
 ; MIPS64R2-NEXT:    dsll $2, $4, 1
 ; MIPS64R2-NEXT:    sll $5, $7, 0
-; MIPS64R2-NEXT:    not $3, $5
+; MIPS64R2-NEXT:    xori $3, $5, 63
 ; MIPS64R2-NEXT:    dsllv $2, $2, $3
 ; MIPS64R2-NEXT:    or $3, $2, $1
 ; MIPS64R2-NEXT:    dsrlv $2, $4, $7
@@ -662,7 +662,7 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64R6-NEXT:    dsrlv $1, $5, $7
 ; MIPS64R6-NEXT:    dsll $2, $4, 1
 ; MIPS64R6-NEXT:    sll $3, $7, 0
-; MIPS64R6-NEXT:    not $5, $3
+; MIPS64R6-NEXT:    xori $5, $3, 63
 ; MIPS64R6-NEXT:    dsllv $2, $2, $5
 ; MIPS64R6-NEXT:    or $1, $2, $1
 ; MIPS64R6-NEXT:    andi $2, $3, 64
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 77f9f0ed646ee2e..256da0b89e6038c 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -343,7 +343,7 @@ define signext i64 @shl_i64(i64 signext %a, i64 signext %b) {
 ; MIPS2-NEXT:    nop
 ; MIPS2-NEXT:  $BB4_3: # %entry
 ; MIPS2-NEXT:    sllv $1, $4, $7
-; MIPS2-NEXT:    not $2, $7
+; MIPS2-NEXT:    xori $2, $7, 31
 ; MIPS2-NEXT:    srl $3, $5, 1
 ; MIPS2-NEXT:    srlv $2, $3, $2
 ; MIPS2-NEXT:    or $2, $1, $2
@@ -356,7 +356,7 @@ define signext i64 @shl_i64(i64 signext %a, i64 signext %b) {
 ; MIPS32-LABEL: shl_i64:
 ; MIPS32:       # %bb.0: # %entry
 ; MIPS32-NEXT:    sllv $1, $4, $7
-; MIPS32-NEXT:    not $2, $7
+; MIPS32-NEXT:    xori $2, $7, 31
 ; MIPS32-NEXT:    srl $3, $5, 1
 ; MIPS32-NEXT:    srlv $2, $3, $2
 ; MIPS32-NEXT:    or $2, $1, $2
@@ -369,7 +369,7 @@ define signext i64 @shl_i64(i64 signext %a, i64 signext %b) {
 ; MIPS32R2-LABEL: shl_i64:
 ; MIPS32R2:       # %bb.0: # %entry
 ; MIPS32R2-NEXT:    sllv $1, $4, $7
-; MIPS32R2-NEXT:    not $2, $7
+; MIPS32R2-NEXT:    xori $2, $7, 31
 ; MIPS32R2-NEXT:    srl $3, $5, 1
 ; MIPS32R2-NEXT:    srlv $2, $3, $2
 ; MIPS32R2-NEXT:    or $2, $1, $2
@@ -382,7 +382,7 @@ define signext i64 @shl_i64(i64 signext %a, i64 signext %b) {
 ; MIPS32R6-LABEL: shl_i64:
 ; MIPS32R6:       # %bb.0: # %entry
 ; MIPS32R6-NEXT:    sllv $1, $4, $7
-; MIPS32R6-NEXT:    not $2, $7
+; MIPS32R6-NEXT:    xori $2, $7, 31
 ; MIPS32R6-NEXT:    srl $3, $5, 1
 ; MIPS32R6-NEXT:    srlv $2, $3, $2
 ; MIPS32R6-NEXT:    or $1, $1, $2
@@ -422,9 +422,9 @@ define signext i64 @shl_i64(i64 signext %a, i64 signext %b) {
 ; MMR3-LABEL: shl_i64:
 ; MMR3:       # %bb.0: # %entry
 ; MMR3-NEXT:    sllv $3, $4, $7
-; MMR3-NEXT:    not16 $2, $7
-; MMR3-NEXT:    srl16 $4, $5, 1
-; MMR3-NEXT:    srlv $2, $4, $2
+; MMR3-NEXT:    xori $1, $7, 31
+; MMR3-NEXT:    srl16 $2, $5, 1
+; MMR3-NEXT:    srlv $2, $2, $1
 ; MMR3-NEXT:    or16 $2, $3
 ; MMR3-NEXT:    sllv $3, $5, $7
 ; MMR3-NEXT:    andi16 $4, $7, 32
@@ -436,7 +436,7 @@ define signext i64 @shl_i64(i64 signext %a, i64 signext %b) {
 ; MMR6-LABEL: shl_i64:
 ; MMR6:       # %bb.0: # %entry
 ; MMR6-NEXT:    sllv $1, $4, $7
-; MMR6-NEXT:    not16 $2, $7
+; MMR6-NEXT:    xori $2, $7, 31
 ; MMR6-NEXT:    srl16 $3, $5, 1
 ; MMR6-NEXT:    srlv $2, $3, $2
 ; MMR6-NEXT:    or $1, $1, $2
@@ -668,7 +668,7 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS3-NEXT:  .LBB5_3: # %entry
 ; MIPS3-NEXT:    dsllv $1, $4, $7
 ; MIPS3-NEXT:    dsrl $2, $5, 1
-; MIPS3-NEXT:    not $3, $3
+; MIPS3-NEXT:    xori $3, $3, 63
 ; MIPS3-NEXT:    dsrlv $2, $2, $3
 ; MIPS3-NEXT:    or $2, $1, $2
 ; MIPS3-NEXT:    bnez $8, .LBB5_2
@@ -682,7 +682,7 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS4-NEXT:    dsllv $1, $4, $7
 ; MIPS4-NEXT:    dsrl $2, $5, 1
 ; MIPS4-NEXT:    sll $4, $7, 0
-; MIPS4-NEXT:    not $3, $4
+; MIPS4-NEXT:    xori $3, $4, 63
 ; MIPS4-NEXT:    dsrlv $2, $2, $3
 ; MIPS4-NEXT:    or $2, $1, $2
 ; MIPS4-NEXT:    dsllv $3, $5, $7
@@ -696,7 +696,7 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64-NEXT:    dsllv $1, $4, $7
 ; MIPS64-NEXT:    dsrl $2, $5, 1
 ; MIPS64-NEXT:    sll $4, $7, 0
-; MIPS64-NEXT:    not $3, $4
+; MIPS64-NEXT:    xori $3, $4, 63
 ; MIPS64-NEXT:    dsrlv $2, $2, $3
 ; MIPS64-NEXT:    or $2, $1, $2
 ; MIPS64-NEXT:    dsllv $3, $5, $7
@@ -710,7 +710,7 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64R2-NEXT:    dsllv $1, $4, $7
 ; MIPS64R2-NEXT:    dsrl $2, $5, 1
 ; MIPS64R2-NEXT:    sll $4, $7, 0
-; MIPS64R2-NEXT:    not $3, $4
+; MIPS64R2-NEXT:    xori $3, $4, 63
 ; MIPS64R2-NEXT:    dsrlv $2, $2, $3
 ; MIPS64R2-NEXT:    or $2, $1, $2
 ; MIPS64R2-NEXT:    dsllv $3, $5, $7
@@ -724,7 +724,7 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
 ; MIPS64R6-NEXT:    dsllv $1, $4, $7
 ; MIPS64R6-NEXT:    dsrl $2, $5, 1
 ; MIPS64R6-NEXT:    sll $3, $7, 0
-; MIPS64R6-NEXT:    not $4, $3
+; MIPS64R6-NEXT:    xori $4, $3, 63
 ; MIPS64R6-NEXT:    dsrlv $2, $2, $4
 ; MIPS64R6-NEXT:    or $1, $1, $2
 ; MIPS64R6-NEXT:    andi $2, $3, 64

>From 7ca0f4418a6d385e07d9aff42865c34d3dc2adf7 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 3 Nov 2023 09:59:25 -0700
Subject: [PATCH 48/76] [indvars] Add tests for countdown style loops
 w/nonnegative IVs

Adding test coverage in advance of upcoming changes.  Note that these
tests specifically use unsigned comparisons for the backedges; the
signed versions are fairly well handled by existing logic.
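
For reference, a minimal C++ sketch of the count-down shape these tests
model (the function and names below are illustrative, not taken from the
tests themselves):

  #include <cstdint>

  // Stand-in for the opaque @use_ptr call in the tests.
  static void use(int32_t *p) { (void)p; }

  // Counts down from 'start' with an unsigned backedge compare; indvars
  // must prove 'j' non-negative before it can widen the i32 index to i64.
  void countdown(int32_t *a, uint32_t start) {
    for (uint32_t j = start; j > 6; --j)
      use(&a[j]);
  }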
---
 .../widen-nonnegative-countdown.ll            | 891 ++++++++++++++++++
 1 file changed, 891 insertions(+)
 create mode 100644 llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll

diff --git a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll
new file mode 100644
index 000000000000000..b0392cc6fe2c1ce
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll
@@ -0,0 +1,891 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -passes='indvars' -verify-loop-info -verify-dom-info -verify-scev | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; This file covers proving non-negative IVs for IV widening on common
+; count-down loop structures.  Count-down loops are tricky to prove
+; non-negative for, since we canonicalize a sub to an add and lose
+; the sub's nuw fact.
+
+; use(a[i]) loops
+; -------------------------------------------------------------------
+
+define void @zext_postinc_constant_start(ptr %A) {
+; CHECK-LABEL: @zext_postinc_constant_start(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV_NEXT]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %idxprom.us = zext i32 %j.016.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @zext_preinc_constant_start(ptr %A) {
+; CHECK-LABEL: @zext_preinc_constant_start(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %idxprom.us = zext i32 %j.016.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @zext_postinc(ptr %A, i32 %start) {
+; CHECK-LABEL: @zext_postinc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[J_016_US:%.*]] = phi i32 [ [[INC_US:%.*]], [[FOR_BODY]] ], [ [[START]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INC_US]] = add nsw i32 [[J_016_US]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[INC_US]], 6
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %idxprom.us = zext i32 %j.016.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @zext_preinc(ptr %A, i32 %start) {
+; CHECK-LABEL: @zext_preinc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %idxprom.us = zext i32 %j.016.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @sext_postinc_constant_start(ptr %A) {
+; CHECK-LABEL: @sext_postinc_constant_start(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV_NEXT]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %idxprom.us = sext i32 %j.016.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @sext_preinc_constant_start(ptr %A) {
+; CHECK-LABEL: @sext_preinc_constant_start(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %idxprom.us = sext i32 %j.016.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @sext_postinc(ptr %A, i32 %start) {
+; CHECK-LABEL: @sext_postinc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[TMP1]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %idxprom.us = sext i32 %j.016.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @sext_preinc(ptr %A, i32 %start) {
+; CHECK-LABEL: @sext_preinc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[TMP1]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %idxprom.us = sext i32 %j.016.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+; use(a[i + 1]) loops
+; -------------------------------------------------------------------
+
+define void @zext_postinc_constant_start_offset_constant_one(ptr %A) {
+; CHECK-LABEL: @zext_postinc_constant_start_offset_constant_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV_NEXT]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, 1
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @zext_preinc_constant_start_offset_constant_one(ptr %A) {
+; CHECK-LABEL: @zext_preinc_constant_start_offset_constant_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, 1
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @zext_postinc_offset_constant_one(ptr %A, i32 %start) {
+; CHECK-LABEL: @zext_postinc_offset_constant_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[J_016_US:%.*]] = phi i32 [ [[INC_US:%.*]], [[FOR_BODY]] ], [ [[START]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INC_US]] = add nsw i32 [[J_016_US]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[INC_US]], 6
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, 1
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @zext_preinc_offset_constant_one(ptr %A, i32 %start) {
+; CHECK-LABEL: @zext_preinc_offset_constant_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, 1
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @sext_postinc_constant_start_offset_constant_one(ptr %A) {
+; CHECK-LABEL: @sext_postinc_constant_start_offset_constant_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV_NEXT]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, 1
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @sext_preinc_constant_start_offset_constant_one(ptr %A) {
+; CHECK-LABEL: @sext_preinc_constant_start_offset_constant_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, 1
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @sext_postinc_offset_constant_one(ptr %A, i32 %start) {
+; CHECK-LABEL: @sext_postinc_offset_constant_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, 1
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @sext_preinc_offset_constant_one(ptr %A, i32 %start) {
+; CHECK-LABEL: @sext_preinc_offset_constant_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, 1
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+; use(a[i - 1]) loops
+; -------------------------------------------------------------------
+
+define void @zext_postinc_constant_start_offset_constant_minus_one(ptr %A) {
+; CHECK-LABEL: @zext_postinc_constant_start_offset_constant_minus_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV_NEXT]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, -1
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @zext_preinc_constant_start_offset_constant_minus_one(ptr %A) {
+; CHECK-LABEL: @zext_preinc_constant_start_offset_constant_minus_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, -1
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @zext_postinc_offset_constant_minus_one(ptr %A, i32 %start) {
+; CHECK-LABEL: @zext_postinc_offset_constant_minus_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[J_016_US:%.*]] = phi i32 [ [[INC_US:%.*]], [[FOR_BODY]] ], [ [[START]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[J_016_US]], -1
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INC_US]] = add nsw i32 [[J_016_US]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[INC_US]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, -1
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @zext_preinc_offset_constant_minus_one(ptr %A, i32 %start) {
+; CHECK-LABEL: @zext_preinc_offset_constant_minus_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[J_016_US:%.*]] = phi i32 [ [[INC_US:%.*]], [[FOR_BODY]] ], [ [[START]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[J_016_US]], -1
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INC_US]] = add nsw i32 [[J_016_US]], -1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, -1
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @sext_postinc_constant_start_offset_constant_minus_one(ptr %A) {
+; CHECK-LABEL: @sext_postinc_constant_start_offset_constant_minus_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV_NEXT]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, -1
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @sext_preinc_constant_start_offset_constant_minus_one(ptr %A) {
+; CHECK-LABEL: @sext_preinc_constant_start_offset_constant_minus_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ 1024, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, -1
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @sext_postinc_offset_constant_minus_one(ptr %A, i32 %start) {
+; CHECK-LABEL: @sext_postinc_offset_constant_minus_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP1]], -1
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, -1
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %inc.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @sext_preinc_offset_constant_minus_one(ptr %A, i32 %start) {
+; CHECK-LABEL: @sext_preinc_offset_constant_minus_one(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NONPOS:%.*]] = icmp slt i32 [[START:%.*]], 2
+; CHECK-NEXT:    br i1 [[NONPOS]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[START]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP1]], -1
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonpos = icmp slt i32 %start, 2
+  br i1 %nonpos, label %exit, label %for.body
+
+for.body:
+  %j.016.us = phi i32 [ %start, %entry ], [ %inc.us, %for.body ]
+  %add.us = add i32 %j.016.us, -1
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, ptr %A, i64 %idxprom.us
+  tail call void @use_ptr(ptr %arrayidx.us)
+  %inc.us = add nsw i32 %j.016.us, -1
+  %cmp2.us = icmp ugt i32 %j.016.us, 6
+  br i1 %cmp2.us, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+declare dso_local void @use_ptr(ptr %0)
+
+

>From 1ffea97ffdbe4148db7c6db4638d2ec56234c502 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 3 Nov 2023 10:21:30 -0700
Subject: [PATCH 49/76] [indvars] Support known positive extends in
 getExtendedOperandRecurrence (#70990)

IndVars has an existing notion of a narrow definition which is known to
be positive, and for which sign and zero extension are therefore the
same operation. There's existing logic for forming a SCEV based on the
extension kind and the no-wrap flags. This change extends that logic to
form the opposite extension kind for a positive def if doing so is
allowed by the flags. Note that we already do something analogous in
the getWideRecurrence case.
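
As a plain C++ illustration of why the two extension kinds coincide for
a non-negative narrow value (a standalone sketch, not compiler code):

  #include <cassert>
  #include <cstdint>

  // Sign extension replicates the top bit; zero extension fills with zeros.
  int64_t sext32(int32_t v) { return static_cast<int64_t>(v); }
  int64_t zext32(int32_t v) {
    return static_cast<int64_t>(static_cast<uint32_t>(v));
  }

  int main() {
    int32_t nonneg = 1024;  // a known-non-negative narrow def
    assert(sext32(nonneg) == zext32(nonneg));  // the two extensions agree
    return 0;
  }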
---
 llvm/lib/Transforms/Utils/SimplifyIndVar.cpp  | 17 +++++++-
 .../promote-iv-to-eliminate-casts.ll          |  6 +--
 .../IndVarSimplify/widen-nonnegative.ll       | 42 +++++++++++--------
 llvm/test/Transforms/LoopFlatten/widen-iv.ll  | 26 +++++-------
 4 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index de2556f3cec19c9..9b91d74b1d2fac5 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1393,7 +1393,22 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
   else if (ExtKind == ExtendKind::Zero && OBO->hasNoUnsignedWrap())
     ExtendOperExpr = SE->getZeroExtendExpr(
       SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
-  else
+  else if (DU.NeverNegative) {
+    // For a non-negative NarrowDef, we can choose either type of
+    // extension.  We want to use the current extend kind if legal
+    // (see above), and we only hit this code if we need to check
+    // the opposite case.
+    if (OBO->hasNoSignedWrap()) {
+      ExtKind = ExtendKind::Sign;
+      ExtendOperExpr = SE->getSignExtendExpr(
+        SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+    } else if (OBO->hasNoUnsignedWrap()) {
+      ExtKind = ExtendKind::Zero;
+      ExtendOperExpr = SE->getZeroExtendExpr(
+        SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+    } else
+      return {nullptr, ExtendKind::Unknown};
+  } else
     return {nullptr, ExtendKind::Unknown};
 
   // When creating this SCEV expr, don't apply the current operation's NSW or NUW
diff --git a/llvm/test/Transforms/IndVarSimplify/promote-iv-to-eliminate-casts.ll b/llvm/test/Transforms/IndVarSimplify/promote-iv-to-eliminate-casts.ll
index d912540c95657f3..60e014b0efca53a 100644
--- a/llvm/test/Transforms/IndVarSimplify/promote-iv-to-eliminate-casts.ll
+++ b/llvm/test/Transforms/IndVarSimplify/promote-iv-to-eliminate-casts.ll
@@ -196,8 +196,8 @@ define void @promote_latch_condition_decrementing_loop_01(ptr %p, ptr %a) {
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ [[TMP0]], [[PREHEADER]] ]
 ; CHECK-NEXT:    [[EL:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store atomic i32 0, ptr [[EL]] unordered, align 4
-; CHECK-NEXT:    [[LOOPCOND:%.*]] = icmp slt i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[LOOPCOND:%.*]] = icmp slt i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    br i1 [[LOOPCOND]], label [[LOOPEXIT_LOOPEXIT:%.*]], label [[LOOP]]
 ;
 
@@ -336,8 +336,8 @@ define void @promote_latch_condition_decrementing_loop_04(ptr %p, ptr %a, i1 %co
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ [[TMP0]], [[PREHEADER]] ]
 ; CHECK-NEXT:    [[EL:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store atomic i32 0, ptr [[EL]] unordered, align 4
-; CHECK-NEXT:    [[LOOPCOND:%.*]] = icmp slt i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[LOOPCOND:%.*]] = icmp slt i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    br i1 [[LOOPCOND]], label [[LOOPEXIT_LOOPEXIT:%.*]], label [[LOOP]]
 ;
 
@@ -398,8 +398,8 @@ define void @promote_latch_condition_decrementing_loop_05(ptr %p, ptr %a, i1 %co
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ [[TMP0]], [[PREHEADER]] ]
 ; CHECK-NEXT:    [[EL:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store atomic i32 0, ptr [[EL]] unordered, align 4
-; CHECK-NEXT:    [[LOOPCOND:%.*]] = icmp slt i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[LOOPCOND:%.*]] = icmp slt i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    br i1 [[LOOPCOND]], label [[LOOPEXIT_LOOPEXIT:%.*]], label [[LOOP]]
 ;
 
diff --git a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll
index e9b1de699196664..612e9452c6373dc 100644
--- a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll
+++ b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll
@@ -143,14 +143,15 @@ exit:
 define void @sext_add_nuw(ptr %A, i32 %offset, i32 %M) {
 ; CHECK-LABEL: @sext_add_nuw(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[OFFSET:%.*]] to i64
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[M:%.*]], i32 1)
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[ADD_US:%.*]] = add nuw i32 [[TMP0]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
 ; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -215,14 +216,15 @@ exit:
 define void @zext_add_nsw(ptr %A, i32 %offset, i32 %M) {
 ; CHECK-LABEL: @zext_add_nsw(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[OFFSET:%.*]] to i64
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[M:%.*]], i32 1)
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[TMP0]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
 ; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -322,14 +324,15 @@ exit:
 define void @zext_nneg_add_nsw(ptr %A, i32 %offset, i32 %M) {
 ; CHECK-LABEL: @zext_nneg_add_nsw(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[OFFSET:%.*]] to i64
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[M:%.*]], i32 1)
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[TMP0]], [[OFFSET:%.*]]
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext nneg i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext nneg i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
 ; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -467,14 +470,15 @@ exit:
 define void @sext_mul_nuw(ptr %A, i32 %multiple, i32 %M) {
 ; CHECK-LABEL: @sext_mul_nuw(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[MULTIPLE:%.*]] to i64
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[M:%.*]], i32 1)
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[MUL_US:%.*]] = mul nuw i32 [[TMP0]], [[MULTIPLE:%.*]]
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[MUL_US]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
 ; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -539,14 +543,15 @@ exit:
 define void @zext_mul_nsw(ptr %A, i32 %multiple, i32 %M) {
 ; CHECK-LABEL: @zext_mul_nsw(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[MULTIPLE:%.*]] to i64
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[M:%.*]], i32 1)
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[MUL_US:%.*]] = mul nsw i32 [[TMP0]], [[MULTIPLE:%.*]]
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[MUL_US]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
 ; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -646,14 +651,15 @@ exit:
 define void @zext_nneg_mul_nsw(ptr %A, i32 %multiple, i32 %M) {
 ; CHECK-LABEL: @zext_nneg_mul_nsw(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[MULTIPLE:%.*]] to i64
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[M:%.*]], i32 1)
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SMAX]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[MUL_US:%.*]] = mul nsw i32 [[TMP0]], [[MULTIPLE:%.*]]
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext nneg i32 [[MUL_US]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext nneg i32 [[TMP2]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
 ; CHECK-NEXT:    tail call void @use_ptr(ptr [[ARRAYIDX_US]])
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv.ll b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
index 4692fa829bac5c7..4b46dcde3b04d2e 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 
-; RUN: opt < %s -S -passes='loop-simplify,loop(loop-flatten),dce,verify' -loop-flatten-widen-iv=true \
+; RUN: opt < %s -S -passes='loop-simplify,loop(loop-flatten),adce,verify' -loop-flatten-widen-iv=true \
 ; RUN:     -verify-loop-info -verify-dom-info -verify-scev \
 ; RUN:     -loop-flatten-cost-threshold=6 | \
 ; RUN:     FileCheck %s --check-prefix=CHECK
 
-; RUN: opt < %s -S -passes='loop-simplify,loop(loop-flatten),dce,verify' -loop-flatten-widen-iv=false \
+; RUN: opt < %s -S -passes='loop-simplify,loop(loop-flatten),adce,verify' -loop-flatten-widen-iv=false \
 ; RUN:     -verify-loop-info -verify-dom-info -verify-scev | \
 ; RUN:     FileCheck %s --check-prefix=DONTWIDEN
 
@@ -29,17 +29,15 @@ define void @foo(ptr %A, i32 %N, i32 %M) {
 ; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.us:
-; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[INDVAR_NEXT2:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR1]] to i32
+; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
 ; CHECK:       for.body4.us:
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[FLATTEN_TRUNCIV]] to i64
-; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVAR2]]
 ; CHECK-NEXT:    tail call void @f(ptr [[ARRAYIDX_US]])
 ; CHECK-NEXT:    br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
-; CHECK-NEXT:    [[INDVAR_NEXT2]] = add i64 [[INDVAR1]], 1
-; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT2]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -138,7 +136,6 @@ define void @foo2_sext(i32* nocapture readonly %A, i32 %N, i32 %M) {
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.us:
 ; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT:    [[I_018_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
 ; CHECK:       for.body4.us:
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVAR2]]
@@ -147,7 +144,6 @@ define void @foo2_sext(i32* nocapture readonly %A, i32 %N, i32 %M) {
 ; CHECK-NEXT:    br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
 ; CHECK-NEXT:    [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
-; CHECK-NEXT:    [[INC6_US]] = add nuw nsw i32 [[I_018_US]], 1
 ; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK:       for.cond1.preheader:
@@ -1002,17 +998,15 @@ define void @foo_M_sext(ptr %A, i32 %N, i16 %M) {
 ; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK:       for.cond1.preheader.us:
-; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[INDVAR_NEXT2:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR1]] to i32
+; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
 ; CHECK:       for.body4.us:
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[FLATTEN_TRUNCIV]] to i64
-; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVAR2]]
 ; CHECK-NEXT:    tail call void @f(ptr [[ARRAYIDX_US]])
 ; CHECK-NEXT:    br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
-; CHECK-NEXT:    [[INDVAR_NEXT2]] = add i64 [[INDVAR1]], 1
-; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT2]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]

>From 6b29279a49ceb2d256da8f57480f48ebc4e8cee1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 3 Nov 2023 17:26:32 +0000
Subject: [PATCH 50/76] [X86] X86ISelLowering.cpp - use
 StringRef::starts_with/ends_with instead of startswith/endswith. NFC.

startswith/endswith are thin wrappers around starts_with/ends_with and will eventually be removed (the new names more closely match string_view).
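
For context, a minimal sketch of the renamed API (illustrative only: the
demo() helper and the example strings below are made up, not taken from
this patch):

  #include "llvm/ADT/StringRef.h"
  #include <cassert>

  // Hedged sketch exercising the new StringRef spellings this patch
  // migrates to; the values are arbitrary examples.
  static void demo() {
    llvm::StringRef S = "__foo_r11";
    assert(S.starts_with("__foo")); // was S.startswith("__foo")
    assert(S.ends_with("_r11"));    // was S.endswith("_r11")
  }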
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b1b3c5c82292f70..ec72cee0c392d50 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31782,7 +31782,7 @@ static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
   for (auto &AsmStr : AsmStrs) {
     // Match the OpNo string. We should match exactly to exclude match
     // sub-string, e.g. "$12" contain "$1"
-    if (AsmStr.endswith(OpNoStr1))
+    if (AsmStr.ends_with(OpNoStr1))
       I = AsmStr.size() - OpNoStr1.size();
 
     // Get the index of operand in AsmStr.
@@ -56317,7 +56317,7 @@ static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
 
   for (StringRef Piece : Pieces) {
-    if (!S.startswith(Piece)) // Check if the piece matches.
+    if (!S.starts_with(Piece)) // Check if the piece matches.
       return false;
 
     S = S.substr(Piece.size());
@@ -57456,7 +57456,7 @@ X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
     assert(Target.isSymbol() && "Unexpected target operand for a direct call");
     // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
     // 64-bit indirect thunk calls.
-    assert(StringRef(Target.getSymbolName()).endswith("_r11") &&
+    assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
            "Unexpected register for an indirect thunk call");
     TargetReg = X86::R11;
     break;

>From d3e7a48cbde060a6dbc1edcb00f375fb2f9405dc Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 3 Nov 2023 10:00:41 -0700
Subject: [PATCH 51/76] [OpenMP][NFC] Remove a no-op function
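
For context, the removed entry points were already empty, as the DeviceRTL
hunk below shows:

  // Quoted from openmp/libomptarget/DeviceRTL/src/Reduction.cpp (deleted
  // by this patch); both functions were no-ops.
  void __kmpc_nvptx_end_reduce(int32_t TId) {}
  void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}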

---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp             |  7 -------
 .../nvptx_target_parallel_reduction_codegen.cpp      |  9 ---------
 ...arget_parallel_reduction_codegen_tbaa_PR46146.cpp |  2 --
 clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp  | 12 ------------
 clang/test/OpenMP/reduction_implicit_map.cpp         |  1 -
 .../OpenMP/target_teams_generic_loop_codegen.cpp     |  4 ----
 llvm/include/llvm/Frontend/OpenMP/OMPKinds.def       |  3 ---
 llvm/lib/Transforms/IPO/OpenMPOpt.cpp                |  1 -
 llvm/test/Transforms/OpenMP/add_attributes.ll        | 11 -----------
 openmp/libomptarget/DeviceRTL/include/Interface.h    |  4 ----
 openmp/libomptarget/DeviceRTL/src/Reduction.cpp      |  4 ----
 11 files changed, 58 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 0ed665e0dfb9722..009b3f0a85a3785 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -3081,14 +3081,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
       ++IRHS;
     }
   };
-  llvm::Value *EndArgs[] = {ThreadId};
   RegionCodeGenTy RCG(CodeGen);
-  NVPTXActionTy Action(
-      nullptr, std::nullopt,
-      OMPBuilder.getOrCreateRuntimeFunction(
-          CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
-      EndArgs);
-  RCG.setAction(Action);
   RCG(CGF);
   // There is no need to emit line number for unconditional branch.
   (void)ApplyDebugLocation::CreateEmpty(CGF);
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 094c5ae3522f96d..c2a958dfdd2453e 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -148,7 +148,6 @@ int bar(int n){
 // CHECK-64-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
 // CHECK-64-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK-64-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
-// CHECK-64-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK-64-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-64:       .omp.reduction.done:
 // CHECK-64-NEXT:    ret void
@@ -353,7 +352,6 @@ int bar(int n){
 // CHECK-64-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
 // CHECK-64-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK-64-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
-// CHECK-64-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK-64-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-64:       .omp.reduction.done:
 // CHECK-64-NEXT:    ret void
@@ -609,7 +607,6 @@ int bar(int n){
 // CHECK-64:       cond.end11:
 // CHECK-64-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
 // CHECK-64-NEXT:    store i16 [[COND12]], ptr [[TMP1]], align 2
-// CHECK-64-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
 // CHECK-64-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-64:       .omp.reduction.done:
 // CHECK-64-NEXT:    ret void
@@ -824,7 +821,6 @@ int bar(int n){
 // CHECK-32-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
 // CHECK-32-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK-32-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
-// CHECK-32-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK-32-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-32:       .omp.reduction.done:
 // CHECK-32-NEXT:    ret void
@@ -1029,7 +1025,6 @@ int bar(int n){
 // CHECK-32-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
 // CHECK-32-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK-32-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
-// CHECK-32-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK-32-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-32:       .omp.reduction.done:
 // CHECK-32-NEXT:    ret void
@@ -1285,7 +1280,6 @@ int bar(int n){
 // CHECK-32:       cond.end11:
 // CHECK-32-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
 // CHECK-32-NEXT:    store i16 [[COND12]], ptr [[TMP1]], align 2
-// CHECK-32-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
 // CHECK-32-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-32:       .omp.reduction.done:
 // CHECK-32-NEXT:    ret void
@@ -1500,7 +1494,6 @@ int bar(int n){
 // CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
 // CHECK-32-EX-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK-32-EX-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
-// CHECK-32-EX-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK-32-EX-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-32-EX:       .omp.reduction.done:
 // CHECK-32-EX-NEXT:    ret void
@@ -1705,7 +1698,6 @@ int bar(int n){
 // CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
 // CHECK-32-EX-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK-32-EX-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
-// CHECK-32-EX-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK-32-EX-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-32-EX:       .omp.reduction.done:
 // CHECK-32-EX-NEXT:    ret void
@@ -1961,7 +1953,6 @@ int bar(int n){
 // CHECK-32-EX:       cond.end11:
 // CHECK-32-EX-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
 // CHECK-32-EX-NEXT:    store i16 [[COND12]], ptr [[TMP1]], align 2
-// CHECK-32-EX-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
 // CHECK-32-EX-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK-32-EX:       .omp.reduction.done:
 // CHECK-32-EX-NEXT:    ret void
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
index 5e91833c9be8723..d4bb6c1639e4609 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -338,7 +338,6 @@ void test() {
 // CHECK1-NEXT:    br i1 [[TMP37]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK1:       .omp.reduction.then:
 // CHECK1-NEXT:    [[CALL21:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[TMP2]], ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]]) #[[ATTR12]]
-// CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP34]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK1:       .omp.reduction.done:
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
@@ -832,7 +831,6 @@ void test() {
 // CHECK1-NEXT:    br i1 [[TMP37]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
 // CHECK1:       .omp.reduction.then:
 // CHECK1-NEXT:    [[CALL21:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[TMP2]], ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]]) #[[ATTR12]]
-// CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP34]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK1:       .omp.reduction.done:
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index 137ef3861751bb8..32b67762a1e1e6b 100644
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -106,7 +106,6 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
 // CHECK1-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK1-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
-// CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK1:       .omp.reduction.done:
 // CHECK1-NEXT:    call void @__kmpc_free_shared(ptr [[E1]], i64 8)
@@ -402,7 +401,6 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
 // CHECK1-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK1-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
-// CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK1:       .omp.reduction.done:
 // CHECK1-NEXT:    call void @__kmpc_free_shared(ptr [[D2]], i64 4)
@@ -751,7 +749,6 @@ int bar(int n){
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
 // CHECK1-NEXT:    store i16 [[COND]], ptr [[TMP1]], align 2
-// CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK1:       .omp.reduction.done:
 // CHECK1-NEXT:    ret void
@@ -821,7 +818,6 @@ int bar(int n){
 // CHECK1:       cond.end11:
 // CHECK1-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
 // CHECK1-NEXT:    store i16 [[COND12]], ptr [[TMP1]], align 2
-// CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK1:       .omp.reduction.done:
 // CHECK1-NEXT:    ret void
@@ -1303,7 +1299,6 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
 // CHECK2-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK2-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
-// CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK2:       .omp.reduction.done:
 // CHECK2-NEXT:    call void @__kmpc_free_shared(ptr [[E1]], i32 8)
@@ -1599,7 +1594,6 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
 // CHECK2-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK2-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
-// CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK2:       .omp.reduction.done:
 // CHECK2-NEXT:    call void @__kmpc_free_shared(ptr [[D2]], i32 4)
@@ -1948,7 +1942,6 @@ int bar(int n){
 // CHECK2:       cond.end:
 // CHECK2-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
 // CHECK2-NEXT:    store i16 [[COND]], ptr [[TMP1]], align 2
-// CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK2:       .omp.reduction.done:
 // CHECK2-NEXT:    ret void
@@ -2018,7 +2011,6 @@ int bar(int n){
 // CHECK2:       cond.end11:
 // CHECK2-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
 // CHECK2-NEXT:    store i16 [[COND12]], ptr [[TMP1]], align 2
-// CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK2:       .omp.reduction.done:
 // CHECK2-NEXT:    ret void
@@ -2500,7 +2492,6 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP8:%.*]] = load double, ptr [[E1]], align 8
 // CHECK3-NEXT:    [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
 // CHECK3-NEXT:    store double [[ADD2]], ptr [[TMP0]], align 8
-// CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK3:       .omp.reduction.done:
 // CHECK3-NEXT:    call void @__kmpc_free_shared(ptr [[E1]], i32 8)
@@ -2796,7 +2787,6 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP13:%.*]] = load float, ptr [[D2]], align 4
 // CHECK3-NEXT:    [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
 // CHECK3-NEXT:    store float [[MUL8]], ptr [[TMP1]], align 4
-// CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK3:       .omp.reduction.done:
 // CHECK3-NEXT:    call void @__kmpc_free_shared(ptr [[D2]], i32 4)
@@ -3145,7 +3135,6 @@ int bar(int n){
 // CHECK3:       cond.end:
 // CHECK3-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
 // CHECK3-NEXT:    store i16 [[COND]], ptr [[TMP1]], align 2
-// CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK3:       .omp.reduction.done:
 // CHECK3-NEXT:    ret void
@@ -3215,7 +3204,6 @@ int bar(int n){
 // CHECK3:       cond.end11:
 // CHECK3-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
 // CHECK3-NEXT:    store i16 [[COND12]], ptr [[TMP1]], align 2
-// CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK3:       .omp.reduction.done:
 // CHECK3-NEXT:    ret void
diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp
index 03864f6215bdd1a..58a7c7252848db5 100644
--- a/clang/test/OpenMP/reduction_implicit_map.cpp
+++ b/clang/test/OpenMP/reduction_implicit_map.cpp
@@ -158,7 +158,6 @@ int main()
 // CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr [[E2]], align 8
 // CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP14]], [[TMP15]]
 // CHECK-NEXT:    store double [[ADD]], ptr [[ARRAYIDX]], align 8
-// CHECK-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP10]])
 // CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK:       .omp.reduction.done:
 // CHECK-NEXT:    ret void
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index f9aef3acb1c611e..06e8778988387ac 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -614,7 +614,6 @@ int foo() {
 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK:       omp.arraycpy.done17:
-// CHECK-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])
 // CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK:       .omp.reduction.done:
 // CHECK-NEXT:    ret void
@@ -759,7 +758,6 @@ int foo() {
 // CHECK-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
 // CHECK-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
 // CHECK:       omp.arraycpy.done19:
-// CHECK-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP21]])
 // CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // CHECK:       .omp.reduction.done:
 // CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
@@ -1348,7 +1346,6 @@ int foo() {
 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP42]]
 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
 // IR-GPU:       omp.arraycpy.done17:
-// IR-GPU-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])
 // IR-GPU-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // IR-GPU:       .omp.reduction.done:
 // IR-GPU-NEXT:    ret void
@@ -1495,7 +1492,6 @@ int foo() {
 // IR-GPU-NEXT:    [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
 // IR-GPU-NEXT:    br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
 // IR-GPU:       omp.arraycpy.done19:
-// IR-GPU-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP21]])
 // IR-GPU-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
 // IR-GPU:       .omp.reduction.done:
 // IR-GPU-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 9ee1cdd5313fe8e..5215a5a97a0cbdc 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -471,7 +471,6 @@ __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16)
 __OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
           Int32, SizeTy, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr)
-__OMP_RTL(__kmpc_nvptx_end_reduce_nowait, false, Void, Int32)
 __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
           VoidPtr, Int32, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr,
           GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr)
@@ -1042,8 +1041,6 @@ __OMP_RTL_ATTRS(__kmpc_shuffle_int32, AttributeSet(), SExt,
                 ParamAttrs(SExt, SExt, SExt))
 __OMP_RTL_ATTRS(__kmpc_nvptx_parallel_reduce_nowait_v2, AttributeSet(), SExt,
                 ParamAttrs(AttributeSet(), SExt, SExt, SizeTyExt))
-__OMP_RTL_ATTRS(__kmpc_nvptx_end_reduce_nowait, AttributeSet(), AttributeSet(),
-                ParamAttrs(SExt))
 __OMP_RTL_ATTRS(__kmpc_nvptx_teams_reduce_nowait_v2, AttributeSet(), SExt,
                 ParamAttrs(AttributeSet(), SExt, AttributeSet(), ZExt))
 __OMP_RTL_ATTRS(__kmpc_reduction_get_fixed_buffer, GetterAttrs, AttributeSet(), ParamAttrs())
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 74ebbcde5729247..fe250047759cdcc 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -4940,7 +4940,6 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       case OMPRTL___kmpc_barrier:
       case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
       case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
-      case OMPRTL___kmpc_nvptx_end_reduce_nowait:
       case OMPRTL___kmpc_error:
       case OMPRTL___kmpc_flush:
       case OMPRTL___kmpc_get_hardware_thread_id_in_block:
diff --git a/llvm/test/Transforms/OpenMP/add_attributes.ll b/llvm/test/Transforms/OpenMP/add_attributes.ll
index 2c6ce204dc9e4e5..18271b7646bc97d 100644
--- a/llvm/test/Transforms/OpenMP/add_attributes.ll
+++ b/llvm/test/Transforms/OpenMP/add_attributes.ll
@@ -722,8 +722,6 @@ declare void @__kmpc_kernel_prepare_parallel(ptr);
 
 declare i32 @__kmpc_masked(ptr, i32, i32);
 
-declare void @__kmpc_nvptx_end_reduce_nowait(i32);
-
 declare i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr, i32, i32, i64, ptr, ptr, ptr);
 
 declare i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr, i32, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr);
@@ -1367,9 +1365,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
 ; CHECK: ; Function Attrs: nounwind
 ; CHECK: declare i32 @__kmpc_masked(ptr, i32, i32)
 
-; CHECK-NOT: Function Attrs
-; CHECK: declare void @__kmpc_nvptx_end_reduce_nowait(i32)
-
 ; CHECK-NOT: Function Attrs
 ; CHECK: declare i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr, i32, i32, i64, ptr, ptr, ptr)
 
@@ -2015,9 +2010,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
 ; OPTIMISTIC: ; Function Attrs: nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite)
 ; OPTIMISTIC: declare i32 @__kmpc_masked(ptr nocapture nofree readonly, i32, i32)
 
-; OPTIMISTIC-NOT: Function Attrs
-; OPTIMISTIC: declare void @__kmpc_nvptx_end_reduce_nowait(i32)
-
 ; OPTIMISTIC-NOT: Function Attrs
 ; OPTIMISTIC: declare i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr, i32, i32, i64, ptr, ptr, ptr)
 
@@ -2676,9 +2668,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
 ; EXT: ; Function Attrs: nounwind
 ; EXT: declare signext i32 @__kmpc_masked(ptr, i32 signext, i32 signext)
 
-; EXT-NOT: Function Attrs
-; EXT: declare void @__kmpc_nvptx_end_reduce_nowait(i32 signext)
-
 ; EXT-NOT: Function Attrs
 ; EXT: declare signext i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr, i32 signext, i32 signext, i64, ptr, ptr, ptr)
 
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
index a603e91d1182d41..24de620759c4194 100644
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -230,10 +230,6 @@ void __kmpc_target_deinit();
 /// Reduction
 ///
 ///{
-void __kmpc_nvptx_end_reduce(int32_t TId);
-
-void __kmpc_nvptx_end_reduce_nowait(int32_t TId);
-
 void *__kmpc_reduction_get_fixed_buffer();
 
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index 49687f365b9228f..29a484aa0eb247e 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -335,10 +335,6 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 
   return 0;
 }
-
-void __kmpc_nvptx_end_reduce(int32_t TId) {}
-
-void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}
 }
 
 void *__kmpc_reduction_get_fixed_buffer() {

>From b863fcb6d4514ca61f7cfe5f6750b8ce06659eae Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin at accesssoftek.com>
Date: Sat, 4 Nov 2023 00:40:16 +0700
Subject: [PATCH 52/76] [OptBisect] Remove an unused method declaration. NFC
 (#71145)

The body of `OptBisect::checkPass()` was moved to
`OptBisect::shouldRunPass()` in
[D137149](https://reviews.llvm.org/D137149).
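
For reference, a hedged sketch of the check that now lives in
`OptBisect::shouldRunPass()` (simplified: the real method also prints the
pass name and its bisect number, and the names below are illustrative):

  // Returns true if the pass should run, i.e. if the bisect limit is -1
  // ("run everything") or has not yet been exceeded.
  static bool shouldRunPassSketch(int BisectLimit, int &LastBisectNum) {
    int CurBisectNum = ++LastBisectNum; // number assigned to this pass
    return BisectLimit == -1 || CurBisectNum <= BisectLimit;
  }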
---
 llvm/include/llvm/IR/OptBisect.h | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/IR/OptBisect.h b/llvm/include/llvm/IR/OptBisect.h
index 2987e5ad90c4d3c..507d415d5e112bb 100644
--- a/llvm/include/llvm/IR/OptBisect.h
+++ b/llvm/include/llvm/IR/OptBisect.h
@@ -53,7 +53,14 @@ class OptBisect : public OptPassGate {
 
   /// Checks the bisect limit to determine if the specified pass should run.
   ///
-  /// This forwards to checkPass().
+  /// The method prints the name of the pass, its assigned bisect number, and
+  /// whether or not the pass will be executed. It returns true if the pass
+  /// should run, i.e. if the bisect limit is set to -1 or has not yet been
+  /// exceeded.
+  ///
+  /// Most passes should not call this routine directly. Instead, it is called
+  /// through helper routines provided by the base classes of the pass. For
+  /// instance, function passes should call FunctionPass::skipFunction().
   bool shouldRunPass(const StringRef PassName,
                      StringRef IRDescription) override;
 
@@ -67,19 +74,6 @@ class OptBisect : public OptPassGate {
     LastBisectNum = 0;
   }
 
-  /// Checks the bisect limit to determine if the specified pass should run.
-  ///
-  /// If the bisect limit is set to -1, the function prints a message describing
-  /// the pass and the bisect number assigned to it and return true.  Otherwise,
-  /// the function prints a message with the bisect number assigned to the
-  /// pass and indicating whether or not the pass will be run and return true if
-  /// the bisect limit has not yet been exceeded or false if it has.
-  ///
-  /// Most passes should not call this routine directly. Instead, they are
-  /// called through helper routines provided by the pass base classes.  For
-  /// instance, function passes should call FunctionPass::skipFunction().
-  bool checkPass(const StringRef PassName, const StringRef TargetDesc);
-
   static const int Disabled = std::numeric_limits<int>::max();
 
 private:

>From 9dfdbd788707edc8c39eb2bff16004aba1f3586b Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Tue, 18 Apr 2023 04:16:58 -0700
Subject: [PATCH 53/76] [SLP]Improve tryToGatherExtractElements by using
 per-register analysis.

Currently, the tryToGatherExtractElements function analyzes the whole
vector, regardless of the number of actual registers used in that vector.
This may prevent some optimizations, because per-register analysis can
simplify the final code by reusing more already-emitted vectors and
producing better shuffles.
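
As a hedged illustration of the per-register split (the sizes and the mask
below are made up for the example, not taken from the patch):

  #include <array>
  #include <cstdio>

  int main() {
    // An 8-element gather mask on a target with 4-element registers:
    // NumParts = 2, and each 4-element slice is classified on its own.
    constexpr unsigned NumElts = 8, NumParts = 2;
    constexpr unsigned SliceSize = NumElts / NumParts;
    std::array<int, NumElts> Mask = {0, 1, 2, 3, 12, 13, 14, 15};
    for (unsigned Part = 0; Part < NumParts; ++Part) {
      // Each slice here reads from a single source register, so each can
      // be a cheap single-source shuffle even though the combined mask
      // mixes elements of two registers.
      std::printf("part %u:", Part);
      for (unsigned I = 0; I < SliceSize; ++I)
        std::printf(" %d", Mask[Part * SliceSize + I]);
      std::printf("\n");
    }
    return 0;
  }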

Differential Revision: https://reviews.llvm.org/D148855
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 534 ++++++++++--------
 .../AArch64/extractelements-to-shuffle.ll     | 135 ++---
 .../SLPVectorizer/X86/crash_clear_undefs.ll   |   2 +-
 .../SLPVectorizer/X86/hadd-inseltpoison.ll    | 152 ++++-
 .../test/Transforms/SLPVectorizer/X86/hadd.ll | 152 ++++-
 .../SLPVectorizer/X86/hsub-inseltpoison.ll    | 153 ++++-
 .../test/Transforms/SLPVectorizer/X86/hsub.ll | 153 ++++-
 .../X86/reused-extractelements.ll             |  23 +-
 8 files changed, 878 insertions(+), 426 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d9f016c98aeb17e..1264627ee721ade 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -662,6 +662,36 @@ tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
   return Res;
 }
 
+/// Tries to find extractelement instructions with constant indices from a
+/// fixed vector type and gathers such instructions into a bunch, which is
+/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If the
+/// attempt is successful, the matched scalars are replaced by poison values
+/// in \p VL for future analysis.
+static SmallVector<std::optional<TTI::ShuffleKind>>
+tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+                           SmallVectorImpl<int> &Mask, unsigned NumParts) {
+  assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
+  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
+  Mask.assign(VL.size(), PoisonMaskElem);
+  unsigned SliceSize = VL.size() / NumParts;
+  for (unsigned Part = 0; Part < NumParts; ++Part) {
+    // Scan list of gathered scalars for extractelements that can be represented
+    // as shuffles.
+    MutableArrayRef<Value *> SubVL =
+        MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
+    SmallVector<int> SubMask;
+    std::optional<TTI::ShuffleKind> Res =
+        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
+    ShufflesRes[Part] = Res;
+    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
+  }
+  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
+        return Res.has_value();
+      }))
+    ShufflesRes.clear();
+  return ShufflesRes;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
@@ -7152,101 +7182,80 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
   };
 
-  /// Compute the cost of creating a vector of type \p VecTy containing the
-  /// extracted values from \p VL.
-  InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
-                                     TTI::ShuffleKind ShuffleKind) {
-    unsigned NumElts = 0;
-    for (Value *V : VL) {
-      auto *EE = dyn_cast<ExtractElementInst>(V);
-      if (!EE)
-        continue;
-      auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType());
-      NumElts = std::max(NumElts, VecTy->getNumElements());
-    }
-    assert(NumElts > 0 &&
-           "Expected at least 1-element fixed length vector(s).");
-    auto *VecTy = FixedVectorType::get(VL.front()->getType(), NumElts);
-    unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
-    if (!NumOfParts || NumElts < NumOfParts)
-      return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-    unsigned EltsPerVector = PowerOf2Ceil(divideCeil(NumElts, NumOfParts));
-    int ValNum = -1;
-    int ValIdx = -1;
-    // Check that if trying to permute 2 input vectors (which may result in
-    // several vector registers), each per-register subvector is the result of
-    // the permutation of 2 single registers.
-    if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc &&
-        !all_of(enumerate(Mask), [&](auto &&Arg) {
-          if (Arg.value() == PoisonMaskElem)
-            return true;
-          int CurValNum = (Arg.value() % NumElts) / EltsPerVector;
-          int CurValIdx = Arg.index() / EltsPerVector;
-          if (ValIdx != CurValIdx) {
-            ValIdx = CurValIdx;
-            ValNum = CurValNum;
-            return true;
-          }
-          return CurValNum == ValNum;
-        }))
-      return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-
+  /// Compute the cost of creating a vector containing the extracted values from
+  /// \p VL.
+  InstructionCost
+  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
+                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+                     unsigned NumParts) {
+    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
+    unsigned NumElts =
+        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
+          auto *EE = dyn_cast<ExtractElementInst>(V);
+          if (!EE)
+            return Sz;
+          auto *VecTy = cast<FixedVectorType>(EE->getVectorOperandType());
+          return std::max(Sz, VecTy->getNumElements());
+        });
+    unsigned NumSrcRegs = TTI.getNumberOfParts(
+        FixedVectorType::get(VL.front()->getType(), NumElts));
+    if (NumSrcRegs == 0)
+      NumSrcRegs = 1;
+    // FIXME: this must be moved to TTI for better estimation.
+    unsigned EltsPerVector = PowerOf2Ceil(std::max(
+        divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
+    auto CheckPerRegistersShuffle =
+        [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
+      DenseSet<int> RegIndices;
+      // Check whether we are trying to permute the same one/two input vectors.
+      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
+      int FirstRegId = -1;
+      for (int &I : Mask) {
+        if (I == PoisonMaskElem)
+          continue;
+        int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
+        if (FirstRegId < 0)
+          FirstRegId = RegId;
+        RegIndices.insert(RegId);
+        if (RegIndices.size() > 2)
+          return std::nullopt;
+        if (RegIndices.size() == 2)
+          ShuffleKind = TTI::SK_PermuteTwoSrc;
+        I = (I % NumElts) % EltsPerVector +
+            (RegId == FirstRegId ? 0 : EltsPerVector);
+      }
+      return ShuffleKind;
+    };
     InstructionCost Cost = 0;
 
     // Process extracts in blocks of EltsPerVector to check if the source vector
     // operand can be re-used directly. If not, add the cost of creating a
     // shuffle to extract the values into a vector register.
-    auto *RegisterVecTy =
-        FixedVectorType::get(VL.front()->getType(), EltsPerVector);
-    SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem);
-    TTI::ShuffleKind RegisterSK = TargetTransformInfo::SK_PermuteSingleSrc;
-    Value *VecBase = nullptr;
-    bool IsIdentity = true;
-    for (auto [Idx, V] : enumerate(VL)) {
-      // Reached the start of a new vector registers.
-      if (Idx % EltsPerVector == 0) {
-        RegMask.assign(EltsPerVector, PoisonMaskElem);
-        RegisterSK = TargetTransformInfo::SK_PermuteSingleSrc;
-        VecBase = nullptr;
-      }
-
-      // Need to exclude undefs from analysis.
-      if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem)
+    for (unsigned Part = 0; Part < NumParts; ++Part) {
+      if (!ShuffleKinds[Part])
         continue;
-
-      // Check all extracts for a vector register on the target directly
-      // extract values in order.
-      unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
-      unsigned PrevIdx = CurrentIdx;
-      if (Idx % EltsPerVector != 0 && !isa<UndefValue>(VL[Idx - 1]) &&
-          Mask[Idx - 1] != PoisonMaskElem)
-        PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])) + 1;
-      if (!VecBase) {
-        VecBase = cast<ExtractElementInst>(V)->getVectorOperand();
-        RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
-        IsIdentity = CurrentIdx % EltsPerVector == Idx % EltsPerVector;
-      } else if (VecBase != cast<ExtractElementInst>(V)->getVectorOperand()) {
-        IsIdentity = false;
-        RegisterSK = TargetTransformInfo::SK_PermuteTwoSrc;
-        RegMask[Idx % EltsPerVector] =
-            CurrentIdx % EltsPerVector + EltsPerVector;
-      } else {
-        IsIdentity &= PrevIdx == CurrentIdx &&
-                      CurrentIdx % EltsPerVector == Idx % EltsPerVector;
-        RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
-      }
-
-      if (IsIdentity)
+      ArrayRef<int> MaskSlice =
+          Mask.slice(Part * EltsPerVector,
+                     (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
+                         ? Mask.size() % EltsPerVector
+                         : EltsPerVector);
+      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
+      copy(MaskSlice, SubMask.begin());
+      std::optional<TTI::ShuffleKind> RegShuffleKind =
+          CheckPerRegistersShuffle(SubMask);
+      if (!RegShuffleKind) {
+        Cost += TTI.getShuffleCost(
+            *ShuffleKinds[Part],
+            FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
         continue;
-
-      // Skip all indices, except for the last index per vector block.
-      if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
-        continue;
-
-      // If we have a series of extracts which are not consecutive and hence
-      // cannot re-use the source vector register directly, compute the shuffle
-      // cost to extract the vector with EltsPerVector elements.
-      Cost += TTI.getShuffleCost(RegisterSK, RegisterVecTy, RegMask);
+      }
+      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
+          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
+        Cost += TTI.getShuffleCost(
+            *RegShuffleKind,
+            FixedVectorType::get(VL.front()->getType(), EltsPerVector),
+            SubMask);
+      }
     }
     return Cost;
   }
@@ -7464,90 +7473,76 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                        SmallPtrSetImpl<Value *> &CheckedExtracts)
       : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
         R(R), CheckedExtracts(CheckedExtracts) {}
-  Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask,
-                        TTI::ShuffleKind ShuffleKind) {
+  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+                        unsigned NumParts) {
     if (Mask.empty())
       return nullptr;
     Value *VecBase = nullptr;
     ArrayRef<Value *> VL = E->Scalars;
-    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
     // If the resulting type is scalarized, do not adjust the cost.
-    unsigned VecNumParts = TTI.getNumberOfParts(VecTy);
-    if (VecNumParts == VecTy->getNumElements())
+    if (NumParts == VL.size())
       return nullptr;
-    DenseMap<Value *, int> ExtractVectorsTys;
-    for (auto [I, V] : enumerate(VL)) {
-      // Ignore non-extractelement scalars.
-      if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem))
-        continue;
-      // If all users of instruction are going to be vectorized and this
-      // instruction itself is not going to be vectorized, consider this
-      // instruction as dead and remove its cost from the final cost of the
-      // vectorized tree.
-      // Also, avoid adjusting the cost for extractelements with multiple uses
-      // in different graph entries.
-      const TreeEntry *VE = R.getTreeEntry(V);
-      if (!CheckedExtracts.insert(V).second ||
-          !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
-          (VE && VE != E))
-        continue;
-      auto *EE = cast<ExtractElementInst>(V);
-      VecBase = EE->getVectorOperand();
-      std::optional<unsigned> EEIdx = getExtractIndex(EE);
-      if (!EEIdx)
-        continue;
-      unsigned Idx = *EEIdx;
-      if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) {
-        auto It =
-            ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
-        It->getSecond() = std::min<int>(It->second, Idx);
-      }
-      // Take credit for instruction that will become dead.
-      if (EE->hasOneUse()) {
-        Instruction *Ext = EE->user_back();
-        if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
-              return isa<GetElementPtrInst>(U);
-            })) {
-          // Use getExtractWithExtendCost() to calculate the cost of
-          // extractelement/ext pair.
-          Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
-                                               EE->getVectorOperandType(), Idx);
-          // Add back the cost of s|zext which is subtracted separately.
-          Cost += TTI.getCastInstrCost(
-              Ext->getOpcode(), Ext->getType(), EE->getType(),
-              TTI::getCastContextHint(Ext), CostKind, Ext);
+    // Check if this can be considered reused, i.e. if the same
+    // extractelements were vectorized already.
+    bool PrevNodeFound = any_of(
+        ArrayRef(R.VectorizableTree).take_front(E->Idx),
+        [&](const std::unique_ptr<TreeEntry> &TE) {
+          return ((!TE->isAltShuffle() &&
+                   TE->getOpcode() == Instruction::ExtractElement) ||
+                  TE->State == TreeEntry::NeedToGather) &&
+                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
+                   return VL.size() > Data.index() &&
+                          (Mask[Data.index()] == PoisonMaskElem ||
+                           isa<UndefValue>(VL[Data.index()]) ||
+                           Data.value() == VL[Data.index()]);
+                 });
+        });
+    unsigned SliceSize = VL.size() / NumParts;
+    for (unsigned Part = 0; Part < NumParts; ++Part) {
+      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
+        // Ignore non-extractelement scalars.
+        if (isa<UndefValue>(V) ||
+            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
           continue;
-        }
-      }
-      Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
-                                     Idx);
-    }
-    // Add a cost for subvector extracts/inserts if required.
-    for (const auto &Data : ExtractVectorsTys) {
-      auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
-      unsigned NumElts = VecTy->getNumElements();
-      if (Data.second % NumElts == 0)
-        continue;
-      if (TTI.getNumberOfParts(EEVTy) > VecNumParts) {
-        unsigned Idx = (Data.second / NumElts) * NumElts;
-        unsigned EENumElts = EEVTy->getNumElements();
-        if (Idx % NumElts == 0)
+        // If all users of instruction are going to be vectorized and this
+        // instruction itself is not going to be vectorized, consider this
+        // instruction as dead and remove its cost from the final cost of the
+        // vectorized tree.
+        // Also, avoid adjusting the cost for extractelements with multiple uses
+        // in different graph entries.
+        const TreeEntry *VE = R.getTreeEntry(V);
+        if (!CheckedExtracts.insert(V).second ||
+            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
+            (VE && VE != E))
           continue;
-        if (Idx + NumElts <= EENumElts) {
-          Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                     EEVTy, std::nullopt, CostKind, Idx, VecTy);
-        } else {
-          // Need to round up the subvector type vectorization factor to avoid a
-          // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
-          // <= EENumElts.
-          auto *SubVT =
-              FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
-          Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                     EEVTy, std::nullopt, CostKind, Idx, SubVT);
+        auto *EE = cast<ExtractElementInst>(V);
+        VecBase = EE->getVectorOperand();
+        std::optional<unsigned> EEIdx = getExtractIndex(EE);
+        if (!EEIdx)
+          continue;
+        unsigned Idx = *EEIdx;
+        // Take credit for instruction that will become dead.
+        if (EE->hasOneUse() || !PrevNodeFound) {
+          Instruction *Ext = EE->user_back();
+          if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
+                return isa<GetElementPtrInst>(U);
+              })) {
+            // Use getExtractWithExtendCost() to calculate the cost of
+            // extractelement/ext pair.
+            Cost -=
+                TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+                                             EE->getVectorOperandType(), Idx);
+            // Add back the cost of s|zext which is subtracted separately.
+            Cost += TTI.getCastInstrCost(
+                Ext->getOpcode(), Ext->getType(), EE->getType(),
+                TTI::getCastContextHint(Ext), CostKind, Ext);
+            continue;
+          }
         }
-      } else {
-        Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
-                                   VecTy, std::nullopt, CostKind, 0, EEVTy);
+        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
+                                       CostKind, Idx);
       }
     }
     // Check that gather of extractelements can be represented as just a
@@ -7555,7 +7550,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     // Found the bunch of extractelement instructions that must be gathered
     // into a vector and can be represented as a permutation elements in a
     // single input vector or of 2 input vectors.
-    Cost += computeExtractCost(VL, Mask, ShuffleKind);
+    // Skipped if the same extractelements were vectorized already (reused).
+    if (!PrevNodeFound)
+      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
     InVectors.assign(1, E);
     CommonMask.assign(Mask.begin(), Mask.end());
     transformMaskAfterShuffle(CommonMask, CommonMask);
@@ -7677,7 +7674,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     assert((IsFinalized || CommonMask.empty()) &&
            "Shuffle construction must be finalized.");
   }
-};
+  };
 
 InstructionCost
 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
@@ -7738,40 +7735,41 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       reorderScalars(GatheredScalars, ReorderMask);
     SmallVector<int> Mask;
     SmallVector<int> ExtractMask;
-    std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
     SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
     SmallVector<SmallVector<const TreeEntry *>> Entries;
+    SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
     // Check for gathered extracts.
-    ExtractShuffle =
-        tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask);
-
     bool Resized = false;
     unsigned NumParts = TTI->getNumberOfParts(VecTy);
     if (NumParts == 0 || NumParts >= GatheredScalars.size())
       NumParts = 1;
-    if (Value *VecBase = Estimator.adjustExtracts(
-            E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) {
-      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
-        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
-          Resized = true;
-          GatheredScalars.append(VF - GatheredScalars.size(),
-                                 PoisonValue::get(ScalarTy));
+    if (!all_of(GatheredScalars, UndefValue::classof)) {
+      ExtractShuffles =
+          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
+      if (!ExtractShuffles.empty()) {
+        if (Value *VecBase = Estimator.adjustExtracts(
+                E, ExtractMask, ExtractShuffles, NumParts)) {
+          if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+            if (VF == VecBaseTy->getNumElements() &&
+                GatheredScalars.size() != VF) {
+              Resized = true;
+              GatheredScalars.append(VF - GatheredScalars.size(),
+                                     PoisonValue::get(ScalarTy));
+            }
         }
-    } else if (ExtractShuffle &&
-               TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) {
-      copy(VL, GatheredScalars.begin());
-    }
+      }
 
-    // Do not try to look for reshuffled loads for gathered loads (they will be
-    // handled later), for vectorized scalars, and cases, which are definitely
-    // not profitable (splats and small gather nodes.)
-    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
-        E->isAltShuffle() ||
-        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
-        isSplat(E->Scalars) ||
-        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
-      GatherShuffles =
-          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
+      // Do not try to look for reshuffled loads for gathered loads (they will
+      // be handled later), for vectorized scalars, and for cases which are
+      // definitely not profitable (splats and small gather nodes).
+      if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
+          E->isAltShuffle() ||
+          all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+          isSplat(E->Scalars) ||
+          (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
+        GatherShuffles =
+            isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
+    }
     if (!GatherShuffles.empty()) {
       if (GatherShuffles.size() == 1 &&
           *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
@@ -10013,7 +10011,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       : Builder(Builder), R(R) {}
 
   /// Adjusts extractelements after reusing them.
-  Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
+  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+                        unsigned NumParts, bool &UseVecBaseAsInput) {
+    UseVecBaseAsInput = false;
+    SmallPtrSet<Value *, 4> UniqueBases;
     Value *VecBase = nullptr;
     for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
       int Idx = Mask[I];
@@ -10021,6 +10022,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
         continue;
       auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
       VecBase = EI->getVectorOperand();
+      UniqueBases.insert(VecBase);
       // If the only one use is vectorized - can delete the extractelement
       // itself.
       if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) {
@@ -10029,7 +10031,74 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
         continue;
       R.eraseInstruction(EI);
     }
-    return VecBase;
+    if (NumParts == 1 || UniqueBases.size() == 1)
+      return VecBase;
+    UseVecBaseAsInput = true;
+    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
+      for (auto [I, Idx] : enumerate(Mask))
+        if (Idx != PoisonMaskElem)
+          Idx = I;
+    };
+    // Perform a multi-register vector shuffle, joining the parts into a
+    // single virtual long vector.
+    // Shuffle each part independently, then insert all these parts into a
+    // long virtual vector register to form the original vector.
+    Value *Vec = nullptr;
+    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
+    unsigned SliceSize = E->Scalars.size() / NumParts;
+    for (unsigned Part = 0; Part < NumParts; ++Part) {
+      ArrayRef<Value *> VL =
+          ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
+      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+      constexpr int MaxBases = 2;
+      SmallVector<Value *, MaxBases> Bases(MaxBases);
+#ifndef NDEBUG
+      int PrevSize = 0;
+#endif // NDEBUG
+      for (const auto [I, V] : enumerate(VL)) {
+        if (SubMask[I] == PoisonMaskElem)
+          continue;
+        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
+        const int Size =
+            cast<FixedVectorType>(VecOp->getType())->getNumElements();
+#ifndef NDEBUG
+        assert((PrevSize == Size || PrevSize == 0) &&
+               "Expected vectors of the same size.");
+        PrevSize = Size;
+#endif // NDEBUG
+        Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
+      }
+      if (!Bases.front())
+        continue;
+      Value *SubVec;
+      if (Bases.back()) {
+        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
+        TransformToIdentity(SubMask);
+      } else {
+        SubVec = Bases.front();
+      }
+      if (!Vec) {
+        Vec = SubVec;
+        assert(Part == 0 && "Expected first part.");
+        copy(SubMask, VecMask.begin());
+      } else {
+        unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+        if (Vec->getType() != SubVec->getType()) {
+          unsigned SubVecVF =
+              cast<FixedVectorType>(SubVec->getType())->getNumElements();
+          VF = std::max(VF, SubVecVF);
+        }
+        // Adjust SubMask.
+        for (auto [I, Idx] : enumerate(SubMask))
+          if (Idx != PoisonMaskElem)
+            Idx += VF;
+        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
+        Vec = createShuffle(Vec, SubVec, VecMask);
+        TransformToIdentity(VecMask);
+      }
+    }
+    copy(VecMask, Mask.begin());
+    return Vec;
   }
   /// Checks if the specified entry \p E needs to be delayed because of its
   /// dependency nodes.
@@ -10372,29 +10441,37 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
   BVTy ShuffleBuilder(Params...);
   ResTy Res = ResTy();
   SmallVector<int> Mask;
-  SmallVector<int> ExtractMask;
-  std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
+  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
+  Value *ExtractVecBase = nullptr;
+  bool UseVecBaseAsInput = false;
   SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
   SmallVector<SmallVector<const TreeEntry *>> Entries;
   Type *ScalarTy = GatheredScalars.front()->getType();
-  unsigned NumParts = TTI->getNumberOfParts(
-      FixedVectorType::get(ScalarTy, GatheredScalars.size()));
+  auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
+  unsigned NumParts = TTI->getNumberOfParts(VecTy);
   if (NumParts == 0 || NumParts >= GatheredScalars.size())
     NumParts = 1;
   if (!all_of(GatheredScalars, UndefValue::classof)) {
     // Check for gathered extracts.
-    ExtractShuffle =
-        tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask);
     bool Resized = false;
-    if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask))
-      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
-        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
-          Resized = true;
-          GatheredScalars.append(VF - GatheredScalars.size(),
-                                 PoisonValue::get(ScalarTy));
-        }
+    ExtractShuffles =
+        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
+    if (!ExtractShuffles.empty()) {
+      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
+              E, ExtractMask, NumParts, UseVecBaseAsInput)) {
+        ExtractVecBase = VecBase;
+        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+          if (VF == VecBaseTy->getNumElements() &&
+              GatheredScalars.size() != VF) {
+            Resized = true;
+            GatheredScalars.append(VF - GatheredScalars.size(),
+                                   PoisonValue::get(ScalarTy));
+          }
+      }
+    }
     // Gather extracts after we check for full matched gathers only.
-    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
         E->isAltShuffle() ||
         all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
         isSplat(E->Scalars) ||
@@ -10545,30 +10622,35 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
       }
     }
   };
-  if (ExtractShuffle || !GatherShuffles.empty()) {
+  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
     bool IsNonPoisoned = true;
     bool IsUsedInExpr = true;
     Value *Vec1 = nullptr;
-    if (ExtractShuffle) {
+    if (!ExtractShuffles.empty()) {
       // A gather of extractelements can be represented as just a shuffle of
       // the one or two vectors the scalars are extracted from.
       // Find the input vectors.
       Value *Vec2 = nullptr;
       for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
-        if (ExtractMask[I] == PoisonMaskElem ||
-            (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
+        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
           ExtractMask[I] = PoisonMaskElem;
-          continue;
-        }
-        if (isa<UndefValue>(E->Scalars[I]))
-          continue;
-        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
-        if (!Vec1) {
-          Vec1 = EI->getVectorOperand();
-        } else if (Vec1 != EI->getVectorOperand()) {
-          assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
-                 "Expected only 1 or 2 vectors shuffle.");
-          Vec2 = EI->getVectorOperand();
+      }
+      if (UseVecBaseAsInput) {
+        Vec1 = ExtractVecBase;
+      } else {
+        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+          if (ExtractMask[I] == PoisonMaskElem)
+            continue;
+          if (isa<UndefValue>(E->Scalars[I]))
+            continue;
+          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+          if (!Vec1) {
+            Vec1 = EI->getVectorOperand();
+          } else if (Vec1 != EI->getVectorOperand()) {
+            assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+                   "Expected only 1 or 2 vectors shuffle.");
+            Vec2 = EI->getVectorOperand();
+          }
         }
       }
       if (Vec2) {
@@ -10629,10 +10711,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     int MSz = Mask.size();
     // Try to build a constant vector and shuffle with it only if we currently
     // have a single permutation and more than one scalar constant.
-    bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty();
+    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
     bool IsIdentityShuffle =
-        (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
-             TTI::SK_PermuteSingleSrc &&
+        ((UseVecBaseAsInput ||
+          all_of(ExtractShuffles,
+                 [](const std::optional<TTI::ShuffleKind> &SK) {
+                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
+                          TTI::SK_PermuteSingleSrc;
+                 })) &&
          none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
          ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
         (!GatherShuffles.empty() &&
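
The key idea in the hunks above is that gathered extractelements are no longer modeled for a single register only: tryToGatherExtractElements now works per part, and the SubMask adjustment offsets the second input's lane indices by VF so that both inputs can feed one two-source shuffle. A minimal standalone sketch of that representation (illustrative only, not taken from the patch; the function name and mask values are made up):

; Lanes that select from %a keep indices 0..VF-1; lanes that select from
; %b use VF..2*VF-1 (here 4..7), which is exactly the "Idx += VF"
; adjustment applied to SubMask above.
define <4 x float> @gather_as_two_source_shuffle(<4 x float> %a, <4 x float> %b) {
  %g = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %g
}
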
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index e60e356e5cd8195..8f76b2e54e6c2d3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -75,64 +75,47 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[TMP4TT_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_END_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 0
-; CHECK-NEXT:    [[VGETQ_LANE45:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[VGETQ_LANE]], [[VGETQ_LANE45]]
-; CHECK-NEXT:    [[CONV48:%.*]] = trunc i64 [[ADD]] to i32
-; CHECK-NEXT:    [[VGETQ_LANE51:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 0
-; CHECK-NEXT:    [[VGETQ_LANE55:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 1
-; CHECK-NEXT:    [[ADD57:%.*]] = add i64 [[VGETQ_LANE51]], [[VGETQ_LANE55]]
-; CHECK-NEXT:    [[CONV60:%.*]] = trunc i64 [[ADD57]] to i32
-; CHECK-NEXT:    [[VGETQ_LANE63:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 0
-; CHECK-NEXT:    [[VGETQ_LANE67:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 1
-; CHECK-NEXT:    [[ADD69:%.*]] = add i64 [[VGETQ_LANE63]], [[VGETQ_LANE67]]
-; CHECK-NEXT:    [[CONV72:%.*]] = trunc i64 [[ADD69]] to i32
-; CHECK-NEXT:    [[VGETQ_LANE75:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 0
-; CHECK-NEXT:    [[VGETQ_LANE79:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 1
-; CHECK-NEXT:    [[ADD81:%.*]] = add i64 [[VGETQ_LANE75]], [[VGETQ_LANE79]]
-; CHECK-NEXT:    [[CONV84:%.*]] = trunc i64 [[ADD81]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32>
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[NUMBEROFBOOLS]], 127
 ; CHECK-NEXT:    [[CMP86284:%.*]] = icmp ugt i32 [[AND]], 31
 ; CHECK-NEXT:    br i1 [[CMP86284]], label [[WHILE_BODY88:%.*]], label [[WHILE_END122:%.*]]
 ; CHECK:       while.body88:
 ; CHECK-NEXT:    [[PA_ADDR_1291:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_END121:%.*]] ], [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ]
 ; CHECK-NEXT:    [[PB_ADDR_1290:%.*]] = phi ptr [ [[INCDEC_PTR89:%.*]], [[WHILE_END121]] ], [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ]
-; CHECK-NEXT:    [[_CTT_0289:%.*]] = phi i32 [ [[ADD99:%.*]], [[WHILE_END121]] ], [ [[CONV48]], [[WHILE_END]] ]
-; CHECK-NEXT:    [[_CFF_0288:%.*]] = phi i32 [ [[ADD106:%.*]], [[WHILE_END121]] ], [ [[CONV60]], [[WHILE_END]] ]
-; CHECK-NEXT:    [[_CTF_0287:%.*]] = phi i32 [ [[ADD113:%.*]], [[WHILE_END121]] ], [ [[CONV72]], [[WHILE_END]] ]
-; CHECK-NEXT:    [[_CFT_0286:%.*]] = phi i32 [ [[ADD120:%.*]], [[WHILE_END121]] ], [ [[CONV84]], [[WHILE_END]] ]
 ; CHECK-NEXT:    [[NBBOOLBLOCK_1285:%.*]] = phi i32 [ [[SUB:%.*]], [[WHILE_END121]] ], [ [[AND]], [[WHILE_END]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP34:%.*]], [[WHILE_END121]] ], [ [[TMP17]], [[WHILE_END]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4
 ; CHECK-NEXT:    br label [[WHILE_BODY93:%.*]]
 ; CHECK:       while.body93:
-; CHECK-NEXT:    [[_CTT_1283:%.*]] = phi i32 [ [[_CTT_0289]], [[WHILE_BODY88]] ], [ [[ADD99]], [[WHILE_BODY93]] ]
-; CHECK-NEXT:    [[_CFF_1282:%.*]] = phi i32 [ [[_CFF_0288]], [[WHILE_BODY88]] ], [ [[ADD106]], [[WHILE_BODY93]] ]
-; CHECK-NEXT:    [[_CTF_1281:%.*]] = phi i32 [ [[_CTF_0287]], [[WHILE_BODY88]] ], [ [[ADD113]], [[WHILE_BODY93]] ]
-; CHECK-NEXT:    [[_CFT_1280:%.*]] = phi i32 [ [[_CFT_0286]], [[WHILE_BODY88]] ], [ [[ADD120]], [[WHILE_BODY93]] ]
-; CHECK-NEXT:    [[A_0279:%.*]] = phi i32 [ [[TMP10]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ]
-; CHECK-NEXT:    [[B_0278:%.*]] = phi i32 [ [[TMP11]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[A_0279:%.*]] = phi i32 [ [[TMP19]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[B_0278:%.*]] = phi i32 [ [[TMP20]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ]
 ; CHECK-NEXT:    [[SHIFT_0277:%.*]] = phi i32 [ 0, [[WHILE_BODY88]] ], [ [[INC:%.*]], [[WHILE_BODY93]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i32> [ [[TMP18]], [[WHILE_BODY88]] ], [ [[TMP34]], [[WHILE_BODY93]] ]
 ; CHECK-NEXT:    [[AND94:%.*]] = and i32 [[A_0279]], 1
 ; CHECK-NEXT:    [[AND95:%.*]] = and i32 [[B_0278]], 1
 ; CHECK-NEXT:    [[SHR96]] = lshr i32 [[A_0279]], 1
 ; CHECK-NEXT:    [[SHR97]] = lshr i32 [[B_0278]], 1
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0
-; CHECK-NEXT:    [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL98]], i1 false
-; CHECK-NEXT:    [[LAND_EXT:%.*]] = zext i1 [[TMP12]] to i32
-; CHECK-NEXT:    [[ADD99]] = add i32 [[_CTT_1283]], [[LAND_EXT]]
-; CHECK-NEXT:    [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0
-; CHECK-NEXT:    [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL103]], i1 false
-; CHECK-NEXT:    [[LAND_EXT105:%.*]] = zext i1 [[TMP13]] to i32
-; CHECK-NEXT:    [[ADD106]] = add i32 [[_CFF_1282]], [[LAND_EXT105]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL103]], i1 false
-; CHECK-NEXT:    [[LAND_EXT112:%.*]] = zext i1 [[TMP14]] to i32
-; CHECK-NEXT:    [[ADD113]] = add i32 [[_CTF_1281]], [[LAND_EXT112]]
-; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL98]], i1 false
-; CHECK-NEXT:    [[LAND_EXT119:%.*]] = zext i1 [[TMP15]] to i32
-; CHECK-NEXT:    [[ADD120]] = add i32 [[_CFT_1280]], [[LAND_EXT119]]
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[AND94]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32>
+; CHECK-NEXT:    [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[SHIFT_0277]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 32
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[WHILE_END121]], label [[WHILE_BODY93]]
@@ -144,61 +127,53 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    br i1 [[CMP86]], label [[WHILE_BODY88]], label [[WHILE_END122]]
 ; CHECK:       while.end122:
 ; CHECK-NEXT:    [[NBBOOLBLOCK_1_LCSSA:%.*]] = phi i32 [ [[AND]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_END121]] ]
-; CHECK-NEXT:    [[_CFT_0_LCSSA:%.*]] = phi i32 [ [[CONV84]], [[WHILE_END]] ], [ [[ADD120]], [[WHILE_END121]] ]
-; CHECK-NEXT:    [[_CTF_0_LCSSA:%.*]] = phi i32 [ [[CONV72]], [[WHILE_END]] ], [ [[ADD113]], [[WHILE_END121]] ]
-; CHECK-NEXT:    [[_CFF_0_LCSSA:%.*]] = phi i32 [ [[CONV60]], [[WHILE_END]] ], [ [[ADD106]], [[WHILE_END121]] ]
-; CHECK-NEXT:    [[_CTT_0_LCSSA:%.*]] = phi i32 [ [[CONV48]], [[WHILE_END]] ], [ [[ADD99]], [[WHILE_END121]] ]
 ; CHECK-NEXT:    [[PB_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR89]], [[WHILE_END121]] ]
 ; CHECK-NEXT:    [[PA_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR]], [[WHILE_END121]] ]
+; CHECK-NEXT:    [[TMP35:%.*]] = phi <4 x i32> [ [[TMP17]], [[WHILE_END]] ], [ [[TMP34]], [[WHILE_END121]] ]
 ; CHECK-NEXT:    [[CMP130_NOT299:%.*]] = icmp eq i32 [[NBBOOLBLOCK_1_LCSSA]], 0
 ; CHECK-NEXT:    br i1 [[CMP130_NOT299]], label [[WHILE_END166:%.*]], label [[WHILE_BODY132_PREHEADER:%.*]]
 ; CHECK:       while.body132.preheader:
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4
+; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4
 ; CHECK-NEXT:    [[SUB125:%.*]] = sub nuw nsw i32 32, [[NBBOOLBLOCK_1_LCSSA]]
-; CHECK-NEXT:    [[SHR128:%.*]] = lshr i32 [[TMP16]], [[SUB125]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4
-; CHECK-NEXT:    [[SHR126:%.*]] = lshr i32 [[TMP17]], [[SUB125]]
+; CHECK-NEXT:    [[SHR128:%.*]] = lshr i32 [[TMP36]], [[SUB125]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4
+; CHECK-NEXT:    [[SHR126:%.*]] = lshr i32 [[TMP37]], [[SUB125]]
 ; CHECK-NEXT:    br label [[WHILE_BODY132:%.*]]
 ; CHECK:       while.body132:
-; CHECK-NEXT:    [[_CTT_2306:%.*]] = phi i32 [ [[ADD142:%.*]], [[WHILE_BODY132]] ], [ [[_CTT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
-; CHECK-NEXT:    [[_CFF_2305:%.*]] = phi i32 [ [[ADD150:%.*]], [[WHILE_BODY132]] ], [ [[_CFF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
-; CHECK-NEXT:    [[_CTF_2304:%.*]] = phi i32 [ [[ADD157:%.*]], [[WHILE_BODY132]] ], [ [[_CTF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
-; CHECK-NEXT:    [[_CFT_2303:%.*]] = phi i32 [ [[ADD164:%.*]], [[WHILE_BODY132]] ], [ [[_CFT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
 ; CHECK-NEXT:    [[NBBOOLBLOCK_2302:%.*]] = phi i32 [ [[DEC165:%.*]], [[WHILE_BODY132]] ], [ [[NBBOOLBLOCK_1_LCSSA]], [[WHILE_BODY132_PREHEADER]] ]
 ; CHECK-NEXT:    [[A_1301:%.*]] = phi i32 [ [[SHR135:%.*]], [[WHILE_BODY132]] ], [ [[SHR126]], [[WHILE_BODY132_PREHEADER]] ]
 ; CHECK-NEXT:    [[B_1300:%.*]] = phi i32 [ [[SHR136:%.*]], [[WHILE_BODY132]] ], [ [[SHR128]], [[WHILE_BODY132_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP51:%.*]], [[WHILE_BODY132]] ], [ [[TMP35]], [[WHILE_BODY132_PREHEADER]] ]
 ; CHECK-NEXT:    [[AND133:%.*]] = and i32 [[A_1301]], 1
 ; CHECK-NEXT:    [[AND134:%.*]] = and i32 [[B_1300]], 1
 ; CHECK-NEXT:    [[SHR135]] = lshr i32 [[A_1301]], 1
 ; CHECK-NEXT:    [[SHR136]] = lshr i32 [[B_1300]], 1
-; CHECK-NEXT:    [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0
-; CHECK-NEXT:    [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL139]], i1 false
-; CHECK-NEXT:    [[LAND_EXT141:%.*]] = zext i1 [[TMP18]] to i32
-; CHECK-NEXT:    [[ADD142]] = add i32 [[_CTT_2306]], [[LAND_EXT141]]
-; CHECK-NEXT:    [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0
-; CHECK-NEXT:    [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0
-; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL147]], i1 false
-; CHECK-NEXT:    [[LAND_EXT149:%.*]] = zext i1 [[TMP19]] to i32
-; CHECK-NEXT:    [[ADD150]] = add i32 [[_CFF_2305]], [[LAND_EXT149]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL147]], i1 false
-; CHECK-NEXT:    [[LAND_EXT156:%.*]] = zext i1 [[TMP20]] to i32
-; CHECK-NEXT:    [[ADD157]] = add i32 [[_CTF_2304]], [[LAND_EXT156]]
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL139]], i1 false
-; CHECK-NEXT:    [[LAND_EXT163:%.*]] = zext i1 [[TMP21]] to i32
-; CHECK-NEXT:    [[ADD164]] = add i32 [[_CFT_2303]], [[LAND_EXT163]]
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <2 x i32> poison, i32 [[AND133]], i32 0
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer
+; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32>
+; CHECK-NEXT:    [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]]
 ; CHECK-NEXT:    [[DEC165]] = add nsw i32 [[NBBOOLBLOCK_2302]], -1
 ; CHECK-NEXT:    [[CMP130_NOT:%.*]] = icmp eq i32 [[DEC165]], 0
 ; CHECK-NEXT:    br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]]
 ; CHECK:       while.end166:
-; CHECK-NEXT:    [[_CFT_2_LCSSA:%.*]] = phi i32 [ [[_CFT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD164]], [[WHILE_BODY132]] ]
-; CHECK-NEXT:    [[_CTF_2_LCSSA:%.*]] = phi i32 [ [[_CTF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD157]], [[WHILE_BODY132]] ]
-; CHECK-NEXT:    [[_CFF_2_LCSSA:%.*]] = phi i32 [ [[_CFF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD150]], [[WHILE_BODY132]] ]
-; CHECK-NEXT:    [[_CTT_2_LCSSA:%.*]] = phi i32 [ [[_CTT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD142]], [[WHILE_BODY132]] ]
-; CHECK-NEXT:    store i32 [[_CTT_2_LCSSA]], ptr [[CTT:%.*]], align 4
-; CHECK-NEXT:    store i32 [[_CFF_2_LCSSA]], ptr [[CFF:%.*]], align 4
-; CHECK-NEXT:    store i32 [[_CTF_2_LCSSA]], ptr [[CTF:%.*]], align 4
-; CHECK-NEXT:    store i32 [[_CFT_2_LCSSA]], ptr [[CFT:%.*]], align 4
+; CHECK-NEXT:    [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ]
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2
+; CHECK-NEXT:    store i32 [[TMP53]], ptr [[CTT:%.*]], align 4
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3
+; CHECK-NEXT:    store i32 [[TMP54]], ptr [[CFF:%.*]], align 4
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1
+; CHECK-NEXT:    store i32 [[TMP55]], ptr [[CTF:%.*]], align 4
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0
+; CHECK-NEXT:    store i32 [[TMP56]], ptr [[CFT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
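
In the dist_vec checks above, the four scalar counters collapse into a single <4 x i32> accumulator phi, and the scalar results are recovered only at the very end. A hedged sketch of that final step (hypothetical names and lane choices, not copied from the test):

; The vectorized loop keeps all four counters in one register; the final
; stores extract individual lanes, in whatever lane order the vectorizer
; assigned (in the test above, lane 2 feeds the CTT store).
define void @store_lanes(<4 x i32> %acc, ptr %p0, ptr %p1) {
  %l0 = extractelement <4 x i32> %acc, i32 0
  store i32 %l0, ptr %p0, align 4
  %l1 = extractelement <4 x i32> %acc, i32 1
  store i32 %l1, ptr %p1, align 4
  ret void
}
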
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
index de99654d84eb81f..c2369a6a89ec1de 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 ; YAML-NEXT:  Function:        foo
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-3'
+; YAML-NEXT:    - Cost:            '-4'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '10'
 ; YAML-NEXT:  ...
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
index 0217ddcac004687..4a9f717918a029c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -166,11 +166,31 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){
 ;
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @test_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+; SSE-LABEL: @test_v4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[TMP7]]
+;
+; SLM-LABEL: @test_v4f64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[TMP7]]
+;
+; AVX-LABEL: @test_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -266,11 +286,31 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b
 }
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: @test_v8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+; SSE-LABEL: @test_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x float> [[TMP7]]
+;
+; SLM-LABEL: @test_v8f32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[TMP7]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -308,11 +348,31 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: @test_v4i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -334,11 +394,31 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @test_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -376,11 +456,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: @test_v16i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+; SSE-LABEL: @test_v16i16(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[TMP7]]
+;
+; SLM-LABEL: @test_v16i16(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <16 x i16> [[TMP7]]
+;
+; AVX-LABEL: @test_v16i16(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
 ;
   %a0  = extractelement <16 x i16> %a, i32 0
   %a1  = extractelement <16 x i16> %a, i32 1
@@ -448,5 +548,3 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
   %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
   ret <16 x i16> %rv15
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
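
With the per-register modeling, the SSE and SLM prefixes above now expect the 256-bit horizontal add to be carried out as two 128-bit halves that are concatenated at the end, while AVX keeps the single wide form. A minimal sketch of the split shape (assumed inputs, not copied from the checks):

; Two 2-wide adds whose results are glued back into a <4 x double>;
; the <i32 0, i32 1, i32 2, i32 3> mask is a plain concatenation of
; %lo and %hi.
define <4 x double> @hadd_split(<2 x double> %lo0, <2 x double> %lo1, <2 x double> %hi0, <2 x double> %hi1) {
  %lo = fadd <2 x double> %lo0, %lo1
  %hi = fadd <2 x double> %hi0, %hi1
  %r = shufflevector <2 x double> %lo, <2 x double> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %r
}
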
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index c38d116a7a323c6..cac6845c43004f0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -166,11 +166,31 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){
 ;
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @test_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+; SSE-LABEL: @test_v4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[TMP7]]
+;
+; SLM-LABEL: @test_v4f64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[TMP7]]
+;
+; AVX-LABEL: @test_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -266,11 +286,31 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b
 }
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: @test_v8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+; SSE-LABEL: @test_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x float> [[TMP7]]
+;
+; SLM-LABEL: @test_v8f32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[TMP7]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -308,11 +348,31 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: @test_v4i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -334,11 +394,31 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @test_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -376,11 +456,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: @test_v16i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+; SSE-LABEL: @test_v16i16(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[TMP7]]
+;
+; SLM-LABEL: @test_v16i16(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <16 x i16> [[TMP7]]
+;
+; AVX-LABEL: @test_v16i16(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
 ;
   %a0  = extractelement <16 x i16> %a, i32 0
   %a1  = extractelement <16 x i16> %a, i32 1
@@ -448,5 +548,3 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
   %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
   ret <16 x i16> %rv15
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
index 39400ba4ce1e83a..40b6a8c32f5d029 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@@ -145,11 +145,31 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ;
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @test_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+; SSE-LABEL: @test_v4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[TMP7]]
+;
+; SLM-LABEL: @test_v4f64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[TMP7]]
+;
+; AVX-LABEL: @test_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -171,11 +191,31 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 }
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: @test_v8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+; SSE-LABEL: @test_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x float> [[TMP7]]
+;
+; SLM-LABEL: @test_v8f32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[TMP7]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -213,11 +253,31 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: @test_v4i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -239,11 +299,31 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @test_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -281,11 +361,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: @test_v16i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+; SSE-LABEL: @test_v16i16(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[TMP7]]
+;
+; SLM-LABEL: @test_v16i16(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SLM-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <16 x i16> [[TMP7]]
+;
+; AVX-LABEL: @test_v16i16(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
 ;
   %a0  = extractelement <16 x i16> %a, i32 0
   %a1  = extractelement <16 x i16> %a, i32 1
@@ -354,9 +454,6 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
   ret <16 x i16> %rv15
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
 ; AVX1: {{.*}}
 ; AVX2: {{.*}}
 ; AVX512: {{.*}}
-; SLM: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index 6b63de83c56be28..09113323d3ab77a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -145,11 +145,31 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ;
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @test_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+; SSE-LABEL: @test_v4f64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[TMP7]]
+;
+; SLM-LABEL: @test_v4f64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[TMP7]]
+;
+; AVX-LABEL: @test_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x double> [[TMP3]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -171,11 +191,31 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 }
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: @test_v8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+; SSE-LABEL: @test_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x float> [[TMP7]]
+;
+; SLM-LABEL: @test_v8f32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[TMP7]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -213,11 +253,31 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: @test_v4i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -239,11 +299,31 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @test_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -281,11 +361,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: @test_v16i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+; SSE-LABEL: @test_v16i16(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[TMP7]]
+;
+; SLM-LABEL: @test_v16i16(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SLM-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <16 x i16> [[TMP7]]
+;
+; AVX-LABEL: @test_v16i16(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
 ;
   %a0  = extractelement <16 x i16> %a, i32 0
   %a1  = extractelement <16 x i16> %a, i32 1
@@ -354,9 +454,6 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
   ret <16 x i16> %rv15
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
 ; AVX1: {{.*}}
 ; AVX2: {{.*}}
 ; AVX512: {{.*}}
-; SLM: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
index 35cb8c729e106f0..94a1d7aa1951c4f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
@@ -2,23 +2,24 @@
 ; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -pass-remarks-output=%t | FileCheck %s
 ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
 
-; YAML: --- !Passed
+; YAML: --- !Missed
 ; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Name:            NotBeneficial
 ; YAML-NEXT: Function:        g
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '-1'
-; YAML-NEXT:   - String:          ' and with tree size '
-; YAML-NEXT:   - TreeSize:        '4'
+; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
+; YAML-NEXT:   - Cost:            '0'
+; YAML-NEXT:   - String:          ' >= '
+; YAML-NEXT:   - Treshold:        '0'
 
 define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) {
 ; CHECK-LABEL: @g(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A:%.*]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[B:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i32> [[TMP4]]
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 1
+; CHECK-NEXT:    [[X1X1:%.*]] = mul i32 [[X1]], [[X1]]
+; CHECK-NEXT:    [[AB:%.*]] = mul i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <2 x i32> poison, i32 [[X1X1]], i32 0
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <2 x i32> [[INS1]], i32 [[AB]], i32 1
+; CHECK-NEXT:    ret <2 x i32> [[INS2]]
 ;
   %x1 = extractelement <2 x i32> %x, i32 1
   %x1x1 = mul i32 %x1, %x1

>From edd690b02e16e991393bf7f67631196942369aed Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov <serebrennikov.vladislav at gmail.com>
Date: Fri, 3 Nov 2023 21:45:39 +0400
Subject: [PATCH 54/76] [clang][NFC] Refactor `TagTypeKind` (#71160)

This patch converts TagTypeKind into a scoped enum. Among other benefits,
this allows us to forward-declare it where necessary.
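
As a quick aside for reviewers (not part of the patch): the
forward-declaration benefit comes from scoped enums having a fixed
underlying type (int by default), so a declaration without the enumerator
list is already a complete type. A minimal standalone sketch, using the
same enumerator names as the patch:

    enum class TagTypeKind; // forward declaration: OK for a scoped enum

    // Headers that only pass the kind around need nothing more than the
    // forward declaration above.
    const char *describe(TagTypeKind K);

    // The full definition lives in a single header (clang/AST/Type.h here).
    enum class TagTypeKind { Struct, Interface, Union, Class, Enum };

    const char *describe(TagTypeKind K) {
      switch (K) {
      case TagTypeKind::Struct:    return "struct";
      case TagTypeKind::Interface: return "__interface";
      case TagTypeKind::Union:     return "union";
      case TagTypeKind::Class:     return "class";
      case TagTypeKind::Enum:      return "enum";
      }
      return "unknown"; // unreachable for valid kinds
    }

An unscoped `enum TagTypeKind { ... }` without an explicit underlying type
cannot be forward-declared this way, which previously forced every includer
to see the whole definition.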
---
 clang-tools-extra/clang-doc/BitcodeReader.cpp |  14 +--
 clang-tools-extra/clang-doc/BitcodeWriter.cpp |   4 +-
 clang-tools-extra/clang-doc/Generators.cpp    |  10 +-
 .../clang-doc/Representation.cpp              |   2 +-
 clang-tools-extra/clang-doc/Representation.h  |   2 +-
 clang-tools-extra/clang-doc/YAMLGenerator.cpp |  10 +-
 .../clangd/refactor/InsertionPoint.cpp        |   3 +-
 .../unittests/clang-doc/BitcodeTest.cpp       |   2 +-
 .../unittests/clang-doc/HTMLGeneratorTest.cpp |   2 +-
 .../unittests/clang-doc/MDGeneratorTest.cpp   |   2 +-
 .../unittests/clang-doc/MergeTest.cpp         |   6 +-
 .../unittests/clang-doc/SerializeTest.cpp     |  26 ++--
 .../unittests/clang-doc/YAMLGeneratorTest.cpp |   2 +-
 clang/include/clang/AST/ASTContext.h          |   5 +-
 clang/include/clang/AST/Decl.h                |  14 ++-
 clang/include/clang/AST/Type.h                |  12 +-
 clang/lib/AST/ASTContext.cpp                  |  12 +-
 clang/lib/AST/Decl.cpp                        |  12 +-
 clang/lib/AST/DeclCXX.cpp                     |  22 ++--
 clang/lib/AST/DeclTemplate.cpp                |   2 +-
 clang/lib/AST/MicrosoftMangle.cpp             | 104 ++++++++--------
 clang/lib/AST/RecordLayoutBuilder.cpp         |   9 +-
 clang/lib/AST/Type.cpp                        |  35 +++---
 clang/lib/CodeGen/CGObjCMac.cpp               |  14 +--
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |   2 +-
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |   2 +-
 .../Frontend/Rewrite/RewriteModernObjC.cpp    |  37 +++---
 clang/lib/Frontend/Rewrite/RewriteObjC.cpp    |  32 ++---
 clang/lib/Index/IndexSymbol.cpp               |  10 +-
 clang/lib/Index/USRGeneration.cpp             |  46 ++++---
 clang/lib/Sema/HLSLExternalSemaSource.cpp     |   6 +-
 clang/lib/Sema/Sema.cpp                       |   5 +-
 clang/lib/Sema/SemaCodeComplete.cpp           |  27 +++--
 clang/lib/Sema/SemaDecl.cpp                   | 112 ++++++++++--------
 clang/lib/Sema/SemaDeclAttr.cpp               |   2 +-
 clang/lib/Sema/SemaDeclCXX.cpp                |  25 ++--
 clang/lib/Sema/SemaDeclObjC.cpp               |   2 +-
 clang/lib/Sema/SemaExprCXX.cpp                |   9 +-
 clang/lib/Sema/SemaStmt.cpp                   |   5 +-
 clang/lib/Sema/SemaTemplate.cpp               |  16 ++-
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |   4 +-
 clang/lib/Sema/SemaType.cpp                   |  30 +++--
 clang/lib/Sema/TreeTransform.h                |  10 +-
 clang/lib/Serialization/ASTReaderDecl.cpp     |   5 +-
 clang/lib/Serialization/ASTWriter.cpp         |   2 +-
 clang/lib/Serialization/ASTWriterDecl.cpp     |   2 +-
 .../WebKit/NoUncountedMembersChecker.cpp      |   2 +-
 .../WebKit/RefCntblBaseVirtualDtorChecker.cpp |   2 +-
 clang/tools/libclang/CIndexCXX.cpp            |  14 ++-
 clang/unittests/AST/ASTImporterTest.cpp       |   5 +-
 clang/utils/ClangVisualizers/clang.natvis     |  10 +-
 .../Plugins/Language/ObjC/NSDictionary.cpp    |   3 +-
 .../AppleObjCTypeEncodingParser.cpp           |   4 +-
 .../Platform/FreeBSD/PlatformFreeBSD.cpp      |   6 +-
 .../Plugins/Platform/Linux/PlatformLinux.cpp  |   8 +-
 .../Platform/NetBSD/PlatformNetBSD.cpp        |  10 +-
 .../RegisterTypeBuilderClang.cpp              |   3 +-
 .../Plugins/SymbolFile/CTF/SymbolFileCTF.cpp  |  14 +--
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp  |  11 +-
 .../SymbolFile/NativePDB/PdbAstBuilder.cpp    |  23 ++--
 .../NativePDB/UdtRecordCompleter.cpp          |   8 +-
 .../Plugins/SymbolFile/PDB/PDBASTParser.cpp   |  14 +--
 .../MacOSX/SystemRuntimeMacOSX.cpp            |   3 +-
 .../TypeSystem/Clang/TypeSystemClang.cpp      |   5 +-
 lldb/unittests/Symbol/TestTypeSystemClang.cpp |  27 +++--
 65 files changed, 486 insertions(+), 403 deletions(-)

diff --git a/clang-tools-extra/clang-doc/BitcodeReader.cpp b/clang-tools-extra/clang-doc/BitcodeReader.cpp
index 9ac60fa73a782b4..8c97186b299fc2e 100644
--- a/clang-tools-extra/clang-doc/BitcodeReader.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeReader.cpp
@@ -67,13 +67,13 @@ llvm::Error decodeRecord(const Record &R, AccessSpecifier &Field,
 
 llvm::Error decodeRecord(const Record &R, TagTypeKind &Field,
                          llvm::StringRef Blob) {
-  switch (R[0]) {
-  case TTK_Struct:
-  case TTK_Interface:
-  case TTK_Union:
-  case TTK_Class:
-  case TTK_Enum:
-    Field = (TagTypeKind)R[0];
+  switch (static_cast<TagTypeKind>(R[0])) {
+  case TagTypeKind::Struct:
+  case TagTypeKind::Interface:
+  case TagTypeKind::Union:
+  case TagTypeKind::Class:
+  case TagTypeKind::Enum:
+    Field = static_cast<TagTypeKind>(R[0]);
     return llvm::Error::success();
   default:
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.cpp b/clang-tools-extra/clang-doc/BitcodeWriter.cpp
index 2ece99a3124ab55..7e5a11783d303a6 100644
--- a/clang-tools-extra/clang-doc/BitcodeWriter.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeWriter.cpp
@@ -551,7 +551,7 @@ void ClangDocBitcodeWriter::emitBlock(const RecordInfo &I) {
     emitRecord(*I.DefLoc, RECORD_DEFLOCATION);
   for (const auto &L : I.Loc)
     emitRecord(L, RECORD_LOCATION);
-  emitRecord(I.TagType, RECORD_TAG_TYPE);
+  emitRecord(llvm::to_underlying(I.TagType), RECORD_TAG_TYPE);
   emitRecord(I.IsTypeDef, RECORD_IS_TYPE_DEF);
   for (const auto &N : I.Members)
     emitBlock(N);
@@ -578,7 +578,7 @@ void ClangDocBitcodeWriter::emitBlock(const BaseRecordInfo &I) {
   emitRecord(I.USR, BASE_RECORD_USR);
   emitRecord(I.Name, BASE_RECORD_NAME);
   emitRecord(I.Path, BASE_RECORD_PATH);
-  emitRecord(I.TagType, BASE_RECORD_TAG_TYPE);
+  emitRecord(llvm::to_underlying(I.TagType), BASE_RECORD_TAG_TYPE);
   emitRecord(I.IsVirtual, BASE_RECORD_IS_VIRTUAL);
   emitRecord(I.Access, BASE_RECORD_ACCESS);
   emitRecord(I.IsParent, BASE_RECORD_IS_PARENT);
diff --git a/clang-tools-extra/clang-doc/Generators.cpp b/clang-tools-extra/clang-doc/Generators.cpp
index da19c05ab67b9c7..a3986b66f3c7421 100644
--- a/clang-tools-extra/clang-doc/Generators.cpp
+++ b/clang-tools-extra/clang-doc/Generators.cpp
@@ -28,15 +28,15 @@ findGeneratorByName(llvm::StringRef Format) {
 
 std::string getTagType(TagTypeKind AS) {
   switch (AS) {
-  case TagTypeKind::TTK_Class:
+  case TagTypeKind::Class:
     return "class";
-  case TagTypeKind::TTK_Union:
+  case TagTypeKind::Union:
     return "union";
-  case TagTypeKind::TTK_Interface:
+  case TagTypeKind::Interface:
     return "interface";
-  case TagTypeKind::TTK_Struct:
+  case TagTypeKind::Struct:
     return "struct";
-  case TagTypeKind::TTK_Enum:
+  case TagTypeKind::Enum:
     return "enum";
   }
   llvm_unreachable("Unknown TagTypeKind");
diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp
index 3d078d575a12763..7dcf646322e6f69 100644
--- a/clang-tools-extra/clang-doc/Representation.cpp
+++ b/clang-tools-extra/clang-doc/Representation.cpp
@@ -239,7 +239,7 @@ RecordInfo::RecordInfo(SymbolID USR, StringRef Name, StringRef Path)
 
 void RecordInfo::merge(RecordInfo &&Other) {
   assert(mergeable(Other));
-  if (!TagType)
+  if (!llvm::to_underlying(TagType))
     TagType = Other.TagType;
   IsTypeDef = IsTypeDef || Other.IsTypeDef;
   if (Members.empty())
diff --git a/clang-tools-extra/clang-doc/Representation.h b/clang-tools-extra/clang-doc/Representation.h
index aa5fa7f07083eba..a6b144eb7fa2a21 100644
--- a/clang-tools-extra/clang-doc/Representation.h
+++ b/clang-tools-extra/clang-doc/Representation.h
@@ -350,7 +350,7 @@ struct RecordInfo : public SymbolInfo {
   void merge(RecordInfo &&I);
 
   // Type of this record (struct, class, union, interface).
-  TagTypeKind TagType = TagTypeKind::TTK_Struct;
+  TagTypeKind TagType = TagTypeKind::Struct;
 
   // Full qualified name of this record, including namespaces and template
   // specializations.
diff --git a/clang-tools-extra/clang-doc/YAMLGenerator.cpp b/clang-tools-extra/clang-doc/YAMLGenerator.cpp
index 57cb294e0ab6949..b612380a3cfa7fa 100644
--- a/clang-tools-extra/clang-doc/YAMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/YAMLGenerator.cpp
@@ -47,11 +47,11 @@ template <> struct ScalarEnumerationTraits<clang::AccessSpecifier> {
 
 template <> struct ScalarEnumerationTraits<clang::TagTypeKind> {
   static void enumeration(IO &IO, clang::TagTypeKind &Value) {
-    IO.enumCase(Value, "Struct", clang::TagTypeKind::TTK_Struct);
-    IO.enumCase(Value, "Interface", clang::TagTypeKind::TTK_Interface);
-    IO.enumCase(Value, "Union", clang::TagTypeKind::TTK_Union);
-    IO.enumCase(Value, "Class", clang::TagTypeKind::TTK_Class);
-    IO.enumCase(Value, "Enum", clang::TagTypeKind::TTK_Enum);
+    IO.enumCase(Value, "Struct", clang::TagTypeKind::Struct);
+    IO.enumCase(Value, "Interface", clang::TagTypeKind::Interface);
+    IO.enumCase(Value, "Union", clang::TagTypeKind::Union);
+    IO.enumCase(Value, "Class", clang::TagTypeKind::Class);
+    IO.enumCase(Value, "Enum", clang::TagTypeKind::Enum);
   }
 };
 
diff --git a/clang-tools-extra/clangd/refactor/InsertionPoint.cpp b/clang-tools-extra/clangd/refactor/InsertionPoint.cpp
index 4b04021a5bd46d7..e3b11ceb4f01644 100644
--- a/clang-tools-extra/clangd/refactor/InsertionPoint.cpp
+++ b/clang-tools-extra/clangd/refactor/InsertionPoint.cpp
@@ -85,7 +85,8 @@ SourceLocation endLoc(const DeclContext &DC) {
 }
 
 AccessSpecifier getAccessAtEnd(const CXXRecordDecl &C) {
-  AccessSpecifier Spec = (C.getTagKind() == TTK_Class ? AS_private : AS_public);
+  AccessSpecifier Spec =
+      (C.getTagKind() == TagTypeKind::Class ? AS_private : AS_public);
   for (const auto *D : C.decls())
     if (const auto *ASD = llvm::dyn_cast<AccessSpecDecl>(D))
       Spec = ASD->getAccess();
diff --git a/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp b/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
index 53e84b18144530d..5b9745af7810a9a 100644
--- a/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
@@ -81,7 +81,7 @@ TEST(BitcodeTest, emitRecordInfoBitcode) {
   I.Loc.emplace_back(12, llvm::SmallString<16>{"test.cpp"});
 
   I.Members.emplace_back(TypeInfo("int"), "X", AccessSpecifier::AS_private);
-  I.TagType = TagTypeKind::TTK_Class;
+  I.TagType = TagTypeKind::Class;
   I.IsTypeDef = true;
   I.Bases.emplace_back(EmptySID, "F", "path/to/F", true,
                        AccessSpecifier::AS_public, true);
diff --git a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
index 51412591616533b..9aabb1ed30e4265 100644
--- a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
@@ -154,7 +154,7 @@ TEST(HTMLGeneratorTest, emitRecordHTML) {
   SmallString<16> PathTo;
   llvm::sys::path::native("path/to", PathTo);
   I.Members.emplace_back(TypeInfo("int"), "X", AccessSpecifier::AS_private);
-  I.TagType = TagTypeKind::TTK_Class;
+  I.TagType = TagTypeKind::Class;
   I.Parents.emplace_back(EmptySID, "F", InfoType::IT_record, "F", PathTo);
   I.VirtualParents.emplace_back(EmptySID, "G", InfoType::IT_record);
 
diff --git a/clang-tools-extra/unittests/clang-doc/MDGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/MDGeneratorTest.cpp
index 1bbd24eebb784a2..d3567efd82283cd 100644
--- a/clang-tools-extra/unittests/clang-doc/MDGeneratorTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MDGeneratorTest.cpp
@@ -86,7 +86,7 @@ TEST(MDGeneratorTest, emitRecordMD) {
   I.Loc.emplace_back(12, llvm::SmallString<16>{"test.cpp"});
 
   I.Members.emplace_back(TypeInfo("int"), "X", AccessSpecifier::AS_private);
-  I.TagType = TagTypeKind::TTK_Class;
+  I.TagType = TagTypeKind::Class;
   I.Parents.emplace_back(EmptySID, "F", InfoType::IT_record);
   I.VirtualParents.emplace_back(EmptySID, "G", InfoType::IT_record);
 
diff --git a/clang-tools-extra/unittests/clang-doc/MergeTest.cpp b/clang-tools-extra/unittests/clang-doc/MergeTest.cpp
index 51fddb97decf0ea..9aa76524194b3b0 100644
--- a/clang-tools-extra/unittests/clang-doc/MergeTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MergeTest.cpp
@@ -84,7 +84,7 @@ TEST(MergeTest, mergeRecordInfos) {
   One.DefLoc = Location(10, llvm::SmallString<16>{"test.cpp"});
 
   One.Members.emplace_back(TypeInfo("int"), "X", AccessSpecifier::AS_private);
-  One.TagType = TagTypeKind::TTK_Class;
+  One.TagType = TagTypeKind::Class;
   One.Parents.emplace_back(EmptySID, "F", InfoType::IT_record);
   One.VirtualParents.emplace_back(EmptySID, "G", InfoType::IT_record);
 
@@ -105,7 +105,7 @@ TEST(MergeTest, mergeRecordInfos) {
 
   Two.Loc.emplace_back(12, llvm::SmallString<16>{"test.cpp"});
 
-  Two.TagType = TagTypeKind::TTK_Class;
+  Two.TagType = TagTypeKind::Class;
 
   Two.Children.Records.emplace_back(NonEmptySID, "SharedChildStruct",
                                     InfoType::IT_record, "path");
@@ -128,7 +128,7 @@ TEST(MergeTest, mergeRecordInfos) {
 
   Expected->Members.emplace_back(TypeInfo("int"), "X",
                                  AccessSpecifier::AS_private);
-  Expected->TagType = TagTypeKind::TTK_Class;
+  Expected->TagType = TagTypeKind::Class;
   Expected->Parents.emplace_back(EmptySID, "F", InfoType::IT_record);
   Expected->VirtualParents.emplace_back(EmptySID, "G", InfoType::IT_record);
   Expected->Bases.emplace_back(EmptySID, "F", "path/to/F", true,
diff --git a/clang-tools-extra/unittests/clang-doc/SerializeTest.cpp b/clang-tools-extra/unittests/clang-doc/SerializeTest.cpp
index ca370de5a516e4a..5df42b9f5bca0b0 100644
--- a/clang-tools-extra/unittests/clang-doc/SerializeTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/SerializeTest.cpp
@@ -167,7 +167,7 @@ typedef struct {} G;)raw",
   RecordInfo ExpectedE(EmptySID, /*Name=*/"E", /*Path=*/"GlobalNamespace");
   ExpectedE.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
-  ExpectedE.TagType = TagTypeKind::TTK_Class;
+  ExpectedE.TagType = TagTypeKind::Class;
   ExpectedE.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
   ExpectedE.Members.emplace_back(TypeInfo("int"), "value",
                                  AccessSpecifier::AS_public);
@@ -210,7 +210,7 @@ typedef struct {} G;)raw",
   RecordInfo ExpectedF(EmptySID, /*Name=*/"F", /*Path=*/"GlobalNamespace");
   ExpectedF.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
-  ExpectedF.TagType = TagTypeKind::TTK_Struct;
+  ExpectedF.TagType = TagTypeKind::Struct;
   ExpectedF.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
   CheckRecordInfo(&ExpectedF, F);
 
@@ -253,7 +253,7 @@ typedef struct {} G;)raw",
   RecordInfo ExpectedG(EmptySID, /*Name=*/"G", /*Path=*/"GlobalNamespace");
   ExpectedG.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
-  ExpectedG.TagType = TagTypeKind::TTK_Struct;
+  ExpectedG.TagType = TagTypeKind::Struct;
   ExpectedG.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
   ExpectedG.IsTypeDef = true;
   CheckRecordInfo(&ExpectedG, G);
@@ -295,7 +295,7 @@ TEST(SerializeTest, emitUndefinedRecordInfo) {
   RecordInfo ExpectedE(EmptySID, /*Name=*/"E", /*Path=*/"GlobalNamespace");
   ExpectedE.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
-  ExpectedE.TagType = TagTypeKind::TTK_Class;
+  ExpectedE.TagType = TagTypeKind::Class;
   ExpectedE.Loc.emplace_back(0, llvm::SmallString<16>{"test.cpp"});
   CheckRecordInfo(&ExpectedE, E);
 }
@@ -308,7 +308,7 @@ TEST(SerializeTest, emitRecordMemberInfo) {
   RecordInfo ExpectedE(EmptySID, /*Name=*/"E", /*Path=*/"GlobalNamespace");
   ExpectedE.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
-  ExpectedE.TagType = TagTypeKind::TTK_Struct;
+  ExpectedE.TagType = TagTypeKind::Struct;
   ExpectedE.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
   ExpectedE.Members.emplace_back(TypeInfo("int"), "I",
                                  AccessSpecifier::AS_public);
@@ -324,7 +324,7 @@ TEST(SerializeTest, emitInternalRecordInfo) {
   ExpectedE.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
   ExpectedE.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
-  ExpectedE.TagType = TagTypeKind::TTK_Class;
+  ExpectedE.TagType = TagTypeKind::Class;
   CheckRecordInfo(&ExpectedE, E);
 
   RecordInfo *G = InfoAsRecord(Infos[2].get());
@@ -332,7 +332,7 @@ TEST(SerializeTest, emitInternalRecordInfo) {
   llvm::sys::path::native(ExpectedGPath);
   RecordInfo ExpectedG(EmptySID, /*Name=*/"G", /*Path=*/ExpectedGPath);
   ExpectedG.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
-  ExpectedG.TagType = TagTypeKind::TTK_Class;
+  ExpectedG.TagType = TagTypeKind::Class;
   ExpectedG.Namespace.emplace_back(EmptySID, "E", InfoType::IT_record);
   ExpectedG.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
@@ -391,7 +391,7 @@ class J : public I<int> {} ;)raw",
   RecordInfo ExpectedF(EmptySID, /*Name=*/"F", /*Path=*/"GlobalNamespace");
   ExpectedF.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace, "");
-  ExpectedF.TagType = TagTypeKind::TTK_Class;
+  ExpectedF.TagType = TagTypeKind::Class;
   ExpectedF.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
   CheckRecordInfo(&ExpectedF, F);
 
@@ -399,7 +399,7 @@ class J : public I<int> {} ;)raw",
   RecordInfo ExpectedG(EmptySID, /*Name=*/"G", /*Path=*/"GlobalNamespace");
   ExpectedG.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
-  ExpectedG.TagType = TagTypeKind::TTK_Class;
+  ExpectedG.TagType = TagTypeKind::Class;
   ExpectedG.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
   ExpectedG.Members.emplace_back(TypeInfo("int"), "I",
                                  AccessSpecifier::AS_protected);
@@ -446,14 +446,14 @@ class J : public I<int> {} ;)raw",
   ExpectedE.Bases.back().Members.emplace_back(TypeInfo("int"), "I",
                                               AccessSpecifier::AS_private);
   ExpectedE.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
-  ExpectedE.TagType = TagTypeKind::TTK_Class;
+  ExpectedE.TagType = TagTypeKind::Class;
   CheckRecordInfo(&ExpectedE, E);
 
   RecordInfo *H = InfoAsRecord(Infos[8].get());
   RecordInfo ExpectedH(EmptySID, /*Name=*/"H", /*Path=*/"GlobalNamespace");
   ExpectedH.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
-  ExpectedH.TagType = TagTypeKind::TTK_Class;
+  ExpectedH.TagType = TagTypeKind::Class;
   ExpectedH.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
   ExpectedH.Parents.emplace_back(EmptySID, /*Name=*/"E", InfoType::IT_record,
                                  /*QualName=*/"E", /*Path=*/"GlobalNamespace");
@@ -500,7 +500,7 @@ class J : public I<int> {} ;)raw",
   RecordInfo ExpectedI(EmptySID, /*Name=*/"I", /*Path=*/"GlobalNamespace");
   ExpectedI.Namespace.emplace_back(EmptySID, "GlobalNamespace",
                                    InfoType::IT_namespace);
-  ExpectedI.TagType = TagTypeKind::TTK_Class;
+  ExpectedI.TagType = TagTypeKind::Class;
   ExpectedI.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
   CheckRecordInfo(&ExpectedI, I);
 
@@ -514,7 +514,7 @@ class J : public I<int> {} ;)raw",
                                /*Path=*/"GlobalNamespace", false,
                                AccessSpecifier::AS_public, true);
   ExpectedJ.DefLoc = Location(0, llvm::SmallString<16>{"test.cpp"});
-  ExpectedJ.TagType = TagTypeKind::TTK_Class;
+  ExpectedJ.TagType = TagTypeKind::Class;
   CheckRecordInfo(&ExpectedJ, J);
 }
 
diff --git a/clang-tools-extra/unittests/clang-doc/YAMLGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/YAMLGeneratorTest.cpp
index 535d49169b5dfff..9b1d87dfa013566 100644
--- a/clang-tools-extra/unittests/clang-doc/YAMLGeneratorTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/YAMLGeneratorTest.cpp
@@ -101,7 +101,7 @@ TEST(YAMLGeneratorTest, emitRecordYAML) {
   Brief->Children.back()->Text = "Value of the thing.";
   I.Members.back().Description.push_back(std::move(TopComment));
 
-  I.TagType = TagTypeKind::TTK_Class;
+  I.TagType = TagTypeKind::Class;
   I.Bases.emplace_back(EmptySID, "F", "path/to/F", true,
                        AccessSpecifier::AS_public, true);
   I.Bases.back().Children.Functions.emplace_back();
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index e5f78dfdc22ab1b..6bb308247223989 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -1197,8 +1197,9 @@ class ASTContext : public RefCountedBase<ASTContext> {
 
   /// Create a new implicit TU-level CXXRecordDecl or RecordDecl
   /// declaration.
-  RecordDecl *buildImplicitRecord(StringRef Name,
-                                  RecordDecl::TagKind TK = TTK_Struct) const;
+  RecordDecl *buildImplicitRecord(
+      StringRef Name,
+      RecordDecl::TagKind TK = RecordDecl::TagKind::Struct) const;
 
   /// Create a new implicit TU-level typedef declaration.
   TypedefDecl *buildImplicitTypedef(QualType T, StringRef Name) const;
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index d8ea8c1dfb4f292..d8495f2c34940c5 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -3703,13 +3703,15 @@ class TagDecl : public TypeDecl,
     return static_cast<TagKind>(TagDeclBits.TagDeclKind);
   }
 
-  void setTagKind(TagKind TK) { TagDeclBits.TagDeclKind = TK; }
+  void setTagKind(TagKind TK) {
+    TagDeclBits.TagDeclKind = llvm::to_underlying(TK);
+  }
 
-  bool isStruct() const { return getTagKind() == TTK_Struct; }
-  bool isInterface() const { return getTagKind() == TTK_Interface; }
-  bool isClass()  const { return getTagKind() == TTK_Class; }
-  bool isUnion()  const { return getTagKind() == TTK_Union; }
-  bool isEnum()   const { return getTagKind() == TTK_Enum; }
+  bool isStruct() const { return getTagKind() == TagTypeKind::Struct; }
+  bool isInterface() const { return getTagKind() == TagTypeKind::Interface; }
+  bool isClass() const { return getTagKind() == TagTypeKind::Class; }
+  bool isUnion() const { return getTagKind() == TagTypeKind::Union; }
+  bool isEnum() const { return getTagKind() == TagTypeKind::Enum; }
 
   /// Is this tag type named, either directly or via being defined in
   /// a typedef of this type?
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index f64cd5e0ef64910..42e9b8f94b4e7b6 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -5718,21 +5718,21 @@ enum class ElaboratedTypeKeyword {
 };
 
 /// The kind of a tag type.
-enum TagTypeKind {
+enum class TagTypeKind {
   /// The "struct" keyword.
-  TTK_Struct,
+  Struct,
 
   /// The "__interface" keyword.
-  TTK_Interface,
+  Interface,
 
   /// The "union" keyword.
-  TTK_Union,
+  Union,
 
   /// The "class" keyword.
-  TTK_Class,
+  Class,
 
   /// The "enum" keyword.
-  TTK_Enum
+  Enum
 };
 
 /// A helper class for Type nodes having an ElaboratedTypeKeyword.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 1cb81cffd37ea58..da90136752210b6 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -6525,12 +6525,12 @@ bool ASTContext::isSameEntity(const NamedDecl *X, const NamedDecl *Y) const {
   if (const auto *TagX = dyn_cast<TagDecl>(X)) {
     const auto *TagY = cast<TagDecl>(Y);
     return (TagX->getTagKind() == TagY->getTagKind()) ||
-           ((TagX->getTagKind() == TTK_Struct ||
-             TagX->getTagKind() == TTK_Class ||
-             TagX->getTagKind() == TTK_Interface) &&
-            (TagY->getTagKind() == TTK_Struct ||
-             TagY->getTagKind() == TTK_Class ||
-             TagY->getTagKind() == TTK_Interface));
+           ((TagX->getTagKind() == TagTypeKind::Struct ||
+             TagX->getTagKind() == TagTypeKind::Class ||
+             TagX->getTagKind() == TagTypeKind::Interface) &&
+            (TagY->getTagKind() == TagTypeKind::Struct ||
+             TagY->getTagKind() == TagTypeKind::Class ||
+             TagY->getTagKind() == TagTypeKind::Interface));
   }
 
   // Functions with the same type and linkage match.
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 42ba8582c46de84..ed848e574233468 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -4639,8 +4639,8 @@ TagDecl::TagDecl(Kind DK, TagKind TK, const ASTContext &C, DeclContext *DC,
                  SourceLocation StartL)
     : TypeDecl(DK, DC, L, Id, StartL), DeclContext(DK), redeclarable_base(C),
       TypedefNameDeclOrQualifier((TypedefNameDecl *)nullptr) {
-  assert((DK != Enum || TK == TTK_Enum) &&
-         "EnumDecl not matched with TTK_Enum");
+  assert((DK != Enum || TK == TagTypeKind::Enum) &&
+         "EnumDecl not matched with TagTypeKind::Enum");
   setPreviousDecl(PrevDecl);
   setTagKind(TK);
   setCompleteDefinition(false);
@@ -4773,7 +4773,7 @@ void TagDecl::setTemplateParameterListsInfo(
 EnumDecl::EnumDecl(ASTContext &C, DeclContext *DC, SourceLocation StartLoc,
                    SourceLocation IdLoc, IdentifierInfo *Id, EnumDecl *PrevDecl,
                    bool Scoped, bool ScopedUsingClassTag, bool Fixed)
-    : TagDecl(Enum, TTK_Enum, C, DC, IdLoc, Id, PrevDecl, StartLoc) {
+    : TagDecl(Enum, TagTypeKind::Enum, C, DC, IdLoc, Id, PrevDecl, StartLoc) {
   assert(Scoped || !ScopedUsingClassTag);
   IntegerType = nullptr;
   setNumPositiveBits(0);
@@ -4962,9 +4962,9 @@ RecordDecl *RecordDecl::Create(const ASTContext &C, TagKind TK, DeclContext *DC,
 }
 
 RecordDecl *RecordDecl::CreateDeserialized(const ASTContext &C, unsigned ID) {
-  RecordDecl *R =
-      new (C, ID) RecordDecl(Record, TTK_Struct, C, nullptr, SourceLocation(),
-                             SourceLocation(), nullptr, nullptr);
+  RecordDecl *R = new (C, ID)
+      RecordDecl(Record, TagTypeKind::Struct, C, nullptr, SourceLocation(),
+                 SourceLocation(), nullptr, nullptr);
   R->setMayHaveOutOfDateDef(C.getLangOpts().Modules);
   return R;
 }
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 4002c63e9f94c12..c944862fcefeeef 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -148,8 +148,8 @@ CXXRecordDecl::CreateLambda(const ASTContext &C, DeclContext *DC,
                             TypeSourceInfo *Info, SourceLocation Loc,
                             unsigned DependencyKind, bool IsGeneric,
                             LambdaCaptureDefault CaptureDefault) {
-  auto *R = new (C, DC) CXXRecordDecl(CXXRecord, TTK_Class, C, DC, Loc, Loc,
-                                      nullptr, nullptr);
+  auto *R = new (C, DC) CXXRecordDecl(CXXRecord, TagTypeKind::Class, C, DC, Loc,
+                                      Loc, nullptr, nullptr);
   R->setBeingDefined(true);
   R->DefinitionData = new (C) struct LambdaDefinitionData(
       R, Info, DependencyKind, IsGeneric, CaptureDefault);
@@ -162,9 +162,9 @@ CXXRecordDecl::CreateLambda(const ASTContext &C, DeclContext *DC,
 
 CXXRecordDecl *
 CXXRecordDecl::CreateDeserialized(const ASTContext &C, unsigned ID) {
-  auto *R = new (C, ID) CXXRecordDecl(
-      CXXRecord, TTK_Struct, C, nullptr, SourceLocation(), SourceLocation(),
-      nullptr, nullptr);
+  auto *R = new (C, ID)
+      CXXRecordDecl(CXXRecord, TagTypeKind::Struct, C, nullptr,
+                    SourceLocation(), SourceLocation(), nullptr, nullptr);
   R->setMayHaveOutOfDateDef(false);
   return R;
 }
@@ -692,11 +692,10 @@ bool CXXRecordDecl::lambdaIsDefaultConstructibleAndAssignable() const {
 }
 
 void CXXRecordDecl::addedMember(Decl *D) {
-  if (!D->isImplicit() &&
-      !isa<FieldDecl>(D) &&
-      !isa<IndirectFieldDecl>(D) &&
-      (!isa<TagDecl>(D) || cast<TagDecl>(D)->getTagKind() == TTK_Class ||
-        cast<TagDecl>(D)->getTagKind() == TTK_Interface))
+  if (!D->isImplicit() && !isa<FieldDecl>(D) && !isa<IndirectFieldDecl>(D) &&
+      (!isa<TagDecl>(D) ||
+       cast<TagDecl>(D)->getTagKind() == TagTypeKind::Class ||
+       cast<TagDecl>(D)->getTagKind() == TagTypeKind::Interface))
     data().HasOnlyCMembers = false;
 
   // Ignore friends and invalid declarations.
@@ -1510,7 +1509,8 @@ void CXXRecordDecl::setTrivialForCallFlags(CXXMethodDecl *D) {
 }
 
 bool CXXRecordDecl::isCLike() const {
-  if (getTagKind() == TTK_Class || getTagKind() == TTK_Interface ||
+  if (getTagKind() == TagTypeKind::Class ||
+      getTagKind() == TagTypeKind::Interface ||
       !TemplateOrInstantiation.isNull())
     return false;
   if (!hasDefinition())
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index be385ca1152546e..7d7556e670f951a 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -920,7 +920,7 @@ ClassTemplateSpecializationDecl(ASTContext &Context, Kind DK, TagKind TK,
 
 ClassTemplateSpecializationDecl::ClassTemplateSpecializationDecl(ASTContext &C,
                                                                  Kind DK)
-    : CXXRecordDecl(DK, TTK_Struct, C, nullptr, SourceLocation(),
+    : CXXRecordDecl(DK, TagTypeKind::Struct, C, nullptr, SourceLocation(),
                     SourceLocation(), nullptr, nullptr),
       SpecializationKind(TSK_Undeclared) {}
 
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index b7cfcbc9dfa4677..50ab6ea59be9d03 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -1312,9 +1312,9 @@ void MicrosoftCXXNameMangler::mangleNestedName(GlobalDecl GD) {
       if (PointersAre64Bit)
         Out << 'E';
       Out << 'A';
-      mangleArtificialTagType(TTK_Struct,
-                             Discriminate("__block_literal", Discriminator,
-                                          ParameterDiscriminator));
+      mangleArtificialTagType(TagTypeKind::Struct,
+                              Discriminate("__block_literal", Discriminator,
+                                           ParameterDiscriminator));
       Out << "@Z";
 
       // If the effective context was a Record, we have fully mangled the
@@ -1974,9 +1974,9 @@ void MicrosoftCXXNameMangler::mangleObjCProtocol(const ObjCProtocolDecl *PD) {
 
   Stream << "?$";
   Extra.mangleSourceName("Protocol");
-  Extra.mangleArtificialTagType(TTK_Struct, PD->getName());
+  Extra.mangleArtificialTagType(TagTypeKind::Struct, PD->getName());
 
-  mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__ObjC"});
+  mangleArtificialTagType(TagTypeKind::Struct, TemplateMangling, {"__ObjC"});
 }
 
 void MicrosoftCXXNameMangler::mangleObjCLifetime(const QualType Type,
@@ -2005,7 +2005,7 @@ void MicrosoftCXXNameMangler::mangleObjCLifetime(const QualType Type,
   Extra.manglePointerExtQualifiers(Quals, Type);
   Extra.mangleType(Type, Range);
 
-  mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__ObjC"});
+  mangleArtificialTagType(TagTypeKind::Struct, TemplateMangling, {"__ObjC"});
 }
 
 void MicrosoftCXXNameMangler::mangleObjCKindOfType(const ObjCObjectType *T,
@@ -2022,7 +2022,7 @@ void MicrosoftCXXNameMangler::mangleObjCKindOfType(const ObjCObjectType *T,
                        ->castAs<ObjCObjectType>(),
                    Quals, Range);
 
-  mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__ObjC"});
+  mangleArtificialTagType(TagTypeKind::Struct, TemplateMangling, {"__ObjC"});
 }
 
 void MicrosoftCXXNameMangler::mangleQualifiers(Qualifiers Quals,
@@ -2223,7 +2223,8 @@ void MicrosoftCXXNameMangler::manglePassObjectSizeArg(
   if (Found == FunArgBackReferences.end()) {
     std::string Name =
         Dynamic ? "__pass_dynamic_object_size" : "__pass_object_size";
-    mangleArtificialTagType(TTK_Enum, Name + llvm::utostr(Type), {"__clang"});
+    mangleArtificialTagType(TagTypeKind::Enum, Name + llvm::utostr(Type),
+                            {"__clang"});
 
     if (FunArgBackReferences.size() < 10) {
       size_t Size = FunArgBackReferences.size();
@@ -2304,7 +2305,7 @@ void MicrosoftCXXNameMangler::mangleAddressSpaceType(QualType T,
 
   Extra.mangleType(T, Range, QMM_Escape);
   mangleQualifiers(Qualifiers(), false);
-  mangleArtificialTagType(TTK_Struct, ASMangling, {"__clang"});
+  mangleArtificialTagType(TagTypeKind::Struct, ASMangling, {"__clang"});
 }
 
 void MicrosoftCXXNameMangler::mangleType(QualType T, SourceRange Range,
@@ -2486,13 +2487,13 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
     llvm_unreachable("placeholder types shouldn't get to name mangling");
 
   case BuiltinType::ObjCId:
-    mangleArtificialTagType(TTK_Struct, "objc_object");
+    mangleArtificialTagType(TagTypeKind::Struct, "objc_object");
     break;
   case BuiltinType::ObjCClass:
-    mangleArtificialTagType(TTK_Struct, "objc_class");
+    mangleArtificialTagType(TagTypeKind::Struct, "objc_class");
     break;
   case BuiltinType::ObjCSel:
-    mangleArtificialTagType(TTK_Struct, "objc_selector");
+    mangleArtificialTagType(TagTypeKind::Struct, "objc_selector");
     break;
 
 #define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
@@ -2502,27 +2503,27 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
 #include "clang/Basic/OpenCLImageTypes.def"
   case BuiltinType::OCLSampler:
     Out << "PA";
-    mangleArtificialTagType(TTK_Struct, "ocl_sampler");
+    mangleArtificialTagType(TagTypeKind::Struct, "ocl_sampler");
     break;
   case BuiltinType::OCLEvent:
     Out << "PA";
-    mangleArtificialTagType(TTK_Struct, "ocl_event");
+    mangleArtificialTagType(TagTypeKind::Struct, "ocl_event");
     break;
   case BuiltinType::OCLClkEvent:
     Out << "PA";
-    mangleArtificialTagType(TTK_Struct, "ocl_clkevent");
+    mangleArtificialTagType(TagTypeKind::Struct, "ocl_clkevent");
     break;
   case BuiltinType::OCLQueue:
     Out << "PA";
-    mangleArtificialTagType(TTK_Struct, "ocl_queue");
+    mangleArtificialTagType(TagTypeKind::Struct, "ocl_queue");
     break;
   case BuiltinType::OCLReserveID:
     Out << "PA";
-    mangleArtificialTagType(TTK_Struct, "ocl_reserveid");
+    mangleArtificialTagType(TagTypeKind::Struct, "ocl_reserveid");
     break;
-#define EXT_OPAQUE_TYPE(ExtType, Id, Ext) \
-  case BuiltinType::Id: \
-    mangleArtificialTagType(TTK_Struct, "ocl_" #ExtType); \
+#define EXT_OPAQUE_TYPE(ExtType, Id, Ext)                                      \
+  case BuiltinType::Id:                                                        \
+    mangleArtificialTagType(TagTypeKind::Struct, "ocl_" #ExtType);             \
     break;
 #include "clang/Basic/OpenCLExtensionTypes.def"
 
@@ -2531,12 +2532,12 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
     break;
 
   case BuiltinType::Float16:
-    mangleArtificialTagType(TTK_Struct, "_Float16", {"__clang"});
+    mangleArtificialTagType(TagTypeKind::Struct, "_Float16", {"__clang"});
     break;
 
   case BuiltinType::Half:
     if (!getASTContext().getLangOpts().HLSL)
-      mangleArtificialTagType(TTK_Struct, "_Half", {"__clang"});
+      mangleArtificialTagType(TagTypeKind::Struct, "_Half", {"__clang"});
     else if (getASTContext().getLangOpts().NativeHalfType)
       Out << "$f16@";
     else
@@ -2544,13 +2545,13 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
     break;
 
   case BuiltinType::BFloat16:
-    mangleArtificialTagType(TTK_Struct, "__bf16", {"__clang"});
+    mangleArtificialTagType(TagTypeKind::Struct, "__bf16", {"__clang"});
     break;
 
 #define WASM_REF_TYPE(InternalName, MangledName, Id, SingletonId, AS)          \
   case BuiltinType::Id:                                                        \
-    mangleArtificialTagType(TTK_Struct, MangledName);                          \
-    mangleArtificialTagType(TTK_Struct, MangledName, {"__clang"});             \
+    mangleArtificialTagType(TagTypeKind::Struct, MangledName);                 \
+    mangleArtificialTagType(TagTypeKind::Struct, MangledName, {"__clang"});    \
     break;
 
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
@@ -2917,19 +2918,19 @@ void MicrosoftCXXNameMangler::mangleType(const UnresolvedUsingType *T,
 // <enum-type>   ::= W4 <name>
 void MicrosoftCXXNameMangler::mangleTagTypeKind(TagTypeKind TTK) {
   switch (TTK) {
-    case TTK_Union:
-      Out << 'T';
-      break;
-    case TTK_Struct:
-    case TTK_Interface:
-      Out << 'U';
-      break;
-    case TTK_Class:
-      Out << 'V';
-      break;
-    case TTK_Enum:
-      Out << "W4";
-      break;
+  case TagTypeKind::Union:
+    Out << 'T';
+    break;
+  case TagTypeKind::Struct:
+  case TagTypeKind::Interface:
+    Out << 'U';
+    break;
+  case TagTypeKind::Class:
+    Out << 'V';
+    break;
+  case TagTypeKind::Enum:
+    Out << "W4";
+    break;
   }
 }
 void MicrosoftCXXNameMangler::mangleType(const EnumType *T, Qualifiers,
@@ -3139,11 +3140,11 @@ void MicrosoftCXXNameMangler::mangleType(const ComplexType *T, Qualifiers,
   Extra.mangleSourceName("_Complex");
   Extra.mangleType(ElementType, Range, QMM_Escape);
 
-  mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__clang"});
+  mangleArtificialTagType(TagTypeKind::Struct, TemplateMangling, {"__clang"});
 }
 
 // Returns true for types that mangleArtificialTagType() gets called for with
-// TTK_Union, TTK_Struct, TTK_Class and where compatibility with MSVC's
+// TagTypeKind Union, Struct, Class and where compatibility with MSVC's
 // mangling matters.
 // (It doesn't matter for Objective-C types and the like that cl.exe doesn't
 // support.)
@@ -3176,14 +3177,17 @@ void MicrosoftCXXNameMangler::mangleType(const VectorType *T, Qualifiers Quals,
   if (!isa<ExtVectorType>(T)) {
     if (getASTContext().getTargetInfo().getTriple().isX86() && ET) {
       if (Width == 64 && ET->getKind() == BuiltinType::LongLong) {
-        mangleArtificialTagType(TTK_Union, "__m64");
+        mangleArtificialTagType(TagTypeKind::Union, "__m64");
       } else if (Width >= 128) {
         if (ET->getKind() == BuiltinType::Float)
-          mangleArtificialTagType(TTK_Union, "__m" + llvm::utostr(Width));
+          mangleArtificialTagType(TagTypeKind::Union,
+                                  "__m" + llvm::utostr(Width));
         else if (ET->getKind() == BuiltinType::LongLong)
-          mangleArtificialTagType(TTK_Union, "__m" + llvm::utostr(Width) + 'i');
+          mangleArtificialTagType(TagTypeKind::Union,
+                                  "__m" + llvm::utostr(Width) + 'i');
         else if (ET->getKind() == BuiltinType::Double)
-          mangleArtificialTagType(TTK_Struct, "__m" + llvm::utostr(Width) + 'd');
+          mangleArtificialTagType(TagTypeKind::Struct,
+                                  "__m" + llvm::utostr(Width) + 'd');
       }
     }
   }
@@ -3203,7 +3207,7 @@ void MicrosoftCXXNameMangler::mangleType(const VectorType *T, Qualifiers Quals,
                      Range, QMM_Escape);
     Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumElements()));
 
-    mangleArtificialTagType(TTK_Union, TemplateMangling, {"__clang"});
+    mangleArtificialTagType(TagTypeKind::Union, TemplateMangling, {"__clang"});
   }
 }
 
@@ -3259,7 +3263,7 @@ void MicrosoftCXXNameMangler::mangleType(const DependentAddressSpaceType *T,
 void MicrosoftCXXNameMangler::mangleType(const ObjCInterfaceType *T, Qualifiers,
                                          SourceRange) {
   // ObjC interfaces have structs underlying them.
-  mangleTagTypeKind(TTK_Struct);
+  mangleTagTypeKind(TagTypeKind::Struct);
   mangleName(T->getDecl());
 }
 
@@ -3279,7 +3283,7 @@ void MicrosoftCXXNameMangler::mangleType(const ObjCObjectType *T,
   TemplateArgBackReferences.swap(OuterTemplateArgsContext);
   NameBackReferences.swap(OuterTemplateContext);
 
-  mangleTagTypeKind(TTK_Struct);
+  mangleTagTypeKind(TagTypeKind::Struct);
 
   Out << "?$";
   if (T->isObjCId())
@@ -3427,7 +3431,7 @@ void MicrosoftCXXNameMangler::mangleType(const AtomicType *T, Qualifiers,
   Extra.mangleSourceName("_Atomic");
   Extra.mangleType(ValueType, Range, QMM_Escape);
 
-  mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__clang"});
+  mangleArtificialTagType(TagTypeKind::Struct, TemplateMangling, {"__clang"});
 }
 
 void MicrosoftCXXNameMangler::mangleType(const PipeType *T, Qualifiers,
@@ -3442,7 +3446,7 @@ void MicrosoftCXXNameMangler::mangleType(const PipeType *T, Qualifiers,
   Extra.mangleType(ElementType, Range, QMM_Escape);
   Extra.mangleIntegerLiteral(llvm::APSInt::get(T->isReadOnly()));
 
-  mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__clang"});
+  mangleArtificialTagType(TagTypeKind::Struct, TemplateMangling, {"__clang"});
 }
 
 void MicrosoftMangleContextImpl::mangleCXXName(GlobalDecl GD,
@@ -3482,7 +3486,7 @@ void MicrosoftCXXNameMangler::mangleType(const BitIntType *T, Qualifiers,
     Extra.mangleSourceName("_BitInt");
   Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumBits()));
 
-  mangleArtificialTagType(TTK_Struct, TemplateMangling, {"__clang"});
+  mangleArtificialTagType(TagTypeKind::Struct, TemplateMangling, {"__clang"});
 }
 
 void MicrosoftCXXNameMangler::mangleType(const DependentBitIntType *T,
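(The reflowed switch in mangleTagTypeKind above keeps the MSVC mangling table unchanged: union mangles to 'T', struct and __interface to 'U', class to 'V', enum to "W4". Restated as a standalone sketch, with a simplified enum and a hypothetical helper name:

  #include <iostream>
  #include <string>

  enum class TagTypeKind { Struct, Interface, Union, Class, Enum };

  // Mirrors the table in MicrosoftCXXNameMangler::mangleTagTypeKind.
  std::string mangleTag(TagTypeKind TTK) {
    switch (TTK) {
    case TagTypeKind::Union:     return "T";
    case TagTypeKind::Struct:
    case TagTypeKind::Interface: return "U";
    case TagTypeKind::Class:     return "V";
    case TagTypeKind::Enum:      return "W4";
    }
    return {}; // unreachable for a well-formed kind
  }

  int main() { std::cout << mangleTag(TagTypeKind::Class) << '\n'; } // prints V
)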
diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp
index 5d4f930fca50e2b..a51c8b938f411c0 100644
--- a/clang/lib/AST/RecordLayoutBuilder.cpp
+++ b/clang/lib/AST/RecordLayoutBuilder.cpp
@@ -2267,9 +2267,12 @@ ItaniumRecordLayoutBuilder::updateExternalFieldOffset(const FieldDecl *Field,
 /// \returns diagnostic %select index.
 static unsigned getPaddingDiagFromTagKind(TagTypeKind Tag) {
   switch (Tag) {
-  case TTK_Struct: return 0;
-  case TTK_Interface: return 1;
-  case TTK_Class: return 2;
+  case TagTypeKind::Struct:
+    return 0;
+  case TagTypeKind::Interface:
+    return 1;
+  case TagTypeKind::Class:
+    return 2;
   default: llvm_unreachable("Invalid tag kind for field padding diagnostic!");
   }
 }
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index d1cbfbd150ba53f..4dd4e926c8104a4 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3035,11 +3035,16 @@ TypeWithKeyword::getKeywordForTypeSpec(unsigned TypeSpec) {
 TagTypeKind
 TypeWithKeyword::getTagTypeKindForTypeSpec(unsigned TypeSpec) {
   switch(TypeSpec) {
-  case TST_class: return TTK_Class;
-  case TST_struct: return TTK_Struct;
-  case TST_interface: return TTK_Interface;
-  case TST_union: return TTK_Union;
-  case TST_enum: return TTK_Enum;
+  case TST_class:
+    return TagTypeKind::Class;
+  case TST_struct:
+    return TagTypeKind::Struct;
+  case TST_interface:
+    return TagTypeKind::Interface;
+  case TST_union:
+    return TagTypeKind::Union;
+  case TST_enum:
+    return TagTypeKind::Enum;
   }
 
   llvm_unreachable("Type specifier is not a tag type kind.");
@@ -3048,15 +3053,15 @@ TypeWithKeyword::getTagTypeKindForTypeSpec(unsigned TypeSpec) {
 ElaboratedTypeKeyword
 TypeWithKeyword::getKeywordForTagTypeKind(TagTypeKind Kind) {
   switch (Kind) {
-  case TTK_Class:
+  case TagTypeKind::Class:
     return ElaboratedTypeKeyword::Class;
-  case TTK_Struct:
+  case TagTypeKind::Struct:
     return ElaboratedTypeKeyword::Struct;
-  case TTK_Interface:
+  case TagTypeKind::Interface:
     return ElaboratedTypeKeyword::Interface;
-  case TTK_Union:
+  case TagTypeKind::Union:
     return ElaboratedTypeKeyword::Union;
-  case TTK_Enum:
+  case TagTypeKind::Enum:
     return ElaboratedTypeKeyword::Enum;
   }
   llvm_unreachable("Unknown tag type kind.");
@@ -3066,15 +3071,15 @@ TagTypeKind
 TypeWithKeyword::getTagTypeKindForKeyword(ElaboratedTypeKeyword Keyword) {
   switch (Keyword) {
   case ElaboratedTypeKeyword::Class:
-    return TTK_Class;
+    return TagTypeKind::Class;
   case ElaboratedTypeKeyword::Struct:
-    return TTK_Struct;
+    return TagTypeKind::Struct;
   case ElaboratedTypeKeyword::Interface:
-    return TTK_Interface;
+    return TagTypeKind::Interface;
   case ElaboratedTypeKeyword::Union:
-    return TTK_Union;
+    return TagTypeKind::Union;
   case ElaboratedTypeKeyword::Enum:
-    return TTK_Enum;
+    return TagTypeKind::Enum;
   case ElaboratedTypeKeyword::None: // Fall through.
   case ElaboratedTypeKeyword::Typename:
     llvm_unreachable("Elaborated type keyword is not a tag type kind.");
diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp
index b7aa3a4cb6373c3..6dd7ca64e5221bd 100644
--- a/clang/lib/CodeGen/CGObjCMac.cpp
+++ b/clang/lib/CodeGen/CGObjCMac.cpp
@@ -5757,10 +5757,9 @@ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm)
   //   id self;
   //   Class cls;
   // }
-  RecordDecl *RD = RecordDecl::Create(Ctx, TTK_Struct,
-                                      Ctx.getTranslationUnitDecl(),
-                                      SourceLocation(), SourceLocation(),
-                                      &Ctx.Idents.get("_objc_super"));
+  RecordDecl *RD = RecordDecl::Create(
+      Ctx, TagTypeKind::Struct, Ctx.getTranslationUnitDecl(), SourceLocation(),
+      SourceLocation(), &Ctx.Idents.get("_objc_super"));
   RD->addDecl(FieldDecl::Create(Ctx, RD, SourceLocation(), SourceLocation(),
                                 nullptr, Ctx.getObjCIdType(), nullptr, nullptr,
                                 false, ICIS_NoInit));
@@ -6110,10 +6109,9 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(CodeGen::CodeGenModul
   // };
 
   // First the clang type for struct _message_ref_t
-  RecordDecl *RD = RecordDecl::Create(Ctx, TTK_Struct,
-                                      Ctx.getTranslationUnitDecl(),
-                                      SourceLocation(), SourceLocation(),
-                                      &Ctx.Idents.get("_message_ref_t"));
+  RecordDecl *RD = RecordDecl::Create(
+      Ctx, TagTypeKind::Struct, Ctx.getTranslationUnitDecl(), SourceLocation(),
+      SourceLocation(), &Ctx.Idents.get("_message_ref_t"));
   RD->addDecl(FieldDecl::Create(Ctx, RD, SourceLocation(), SourceLocation(),
                                 nullptr, Ctx.VoidPtrTy, nullptr, nullptr, false,
                                 ICIS_NoInit));
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 34c9c02884ec555..632a44724184925 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3041,7 +3041,7 @@ createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind,
   //         kmp_int32           liter;
   //         void *              reductions;
   //       };
-  RecordDecl *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
+  RecordDecl *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TagTypeKind::Union);
   UD->startDefinition();
   addFieldToRecordDecl(C, UD, KmpInt32Ty);
   addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 009b3f0a85a3785..370613e75420e23 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -806,7 +806,7 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
   // This is temporary until we remove the fixed sized buffer.
   ASTContext &C = CGM.getContext();
   RecordDecl *StaticRD = C.buildImplicitRecord(
-      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
+      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
   StaticRD->startDefinition();
   for (const RecordDecl *TeamReductionRec : TeamsReductions) {
     QualType RecTy = C.getRecordType(TeamReductionRec);
diff --git a/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
index c645355d5aecdb2..c458700837f4ee4 100644
--- a/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
+++ b/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
@@ -864,9 +864,9 @@ RewriteModernObjC::getIvarAccessString(ObjCIvarDecl *D) {
         CDecl = CatDecl->getClassInterface();
       std::string RecName = std::string(CDecl->getName());
       RecName += "_IMPL";
-      RecordDecl *RD =
-          RecordDecl::Create(*Context, TTK_Struct, TUDecl, SourceLocation(),
-                             SourceLocation(), &Context->Idents.get(RecName));
+      RecordDecl *RD = RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
+                                          SourceLocation(), SourceLocation(),
+                                          &Context->Idents.get(RecName));
       QualType PtrStructIMPL = Context->getPointerType(Context->getTagDeclType(RD));
       unsigned UnsignedIntSize =
       static_cast<unsigned>(Context->getTypeSize(Context->UnsignedIntTy));
@@ -2978,9 +2978,9 @@ Stmt *RewriteModernObjC::RewriteObjCDictionaryLiteralExpr(ObjCDictionaryLiteral
 // };
 QualType RewriteModernObjC::getSuperStructType() {
   if (!SuperStructDecl) {
-    SuperStructDecl = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
-                                         SourceLocation(), SourceLocation(),
-                                         &Context->Idents.get("__rw_objc_super"));
+    SuperStructDecl = RecordDecl::Create(
+        *Context, TagTypeKind::Struct, TUDecl, SourceLocation(),
+        SourceLocation(), &Context->Idents.get("__rw_objc_super"));
     QualType FieldTypes[2];
 
     // struct objc_object *object;
@@ -3006,9 +3006,9 @@ QualType RewriteModernObjC::getSuperStructType() {
 
 QualType RewriteModernObjC::getConstantStringStructType() {
   if (!ConstantStringDecl) {
-    ConstantStringDecl = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
-                                            SourceLocation(), SourceLocation(),
-                         &Context->Idents.get("__NSConstantStringImpl"));
+    ConstantStringDecl = RecordDecl::Create(
+        *Context, TagTypeKind::Struct, TUDecl, SourceLocation(),
+        SourceLocation(), &Context->Idents.get("__NSConstantStringImpl"));
     QualType FieldTypes[4];
 
     // struct objc_object *receiver;
@@ -3782,10 +3782,9 @@ QualType RewriteModernObjC::SynthesizeBitfieldGroupStructType(
                               SmallVectorImpl<ObjCIvarDecl *> &IVars) {
   std::string StructTagName;
   ObjCIvarBitfieldGroupType(IV, StructTagName);
-  RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct,
-                                      Context->getTranslationUnitDecl(),
-                                      SourceLocation(), SourceLocation(),
-                                      &Context->Idents.get(StructTagName));
+  RecordDecl *RD = RecordDecl::Create(
+      *Context, TagTypeKind::Struct, Context->getTranslationUnitDecl(),
+      SourceLocation(), SourceLocation(), &Context->Idents.get(StructTagName));
   for (unsigned i=0, e = IVars.size(); i < e; i++) {
     ObjCIvarDecl *Ivar = IVars[i];
     RD->addDecl(FieldDecl::Create(*Context, RD, SourceLocation(), SourceLocation(),
@@ -4588,7 +4587,7 @@ Stmt *RewriteModernObjC::SynthesizeBlockCall(CallExpr *Exp, const Expr *BlockExp
   const FunctionProtoType *FTP = dyn_cast<FunctionProtoType>(FT);
   // FTP will be null for closures that don't take arguments.
 
-  RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
+  RecordDecl *RD = RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
                                       SourceLocation(), SourceLocation(),
                                       &Context->Idents.get("__block_impl"));
   QualType PtrBlock = Context->getPointerType(Context->getTagDeclType(RD));
@@ -5347,9 +5346,9 @@ Stmt *RewriteModernObjC::SynthBlockInitExpr(BlockExpr *Exp,
       RewriteByRefString(RecName, Name, ND, true);
       IdentifierInfo *II = &Context->Idents.get(RecName.c_str()
                                                 + sizeof("struct"));
-      RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
-                                          SourceLocation(), SourceLocation(),
-                                          II);
+      RecordDecl *RD =
+          RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
+                             SourceLocation(), SourceLocation(), II);
       assert(RD && "SynthBlockInitExpr(): Can't find RecordDecl");
       QualType castT = Context->getPointerType(Context->getTagDeclType(RD));
 
@@ -7508,8 +7507,8 @@ Stmt *RewriteModernObjC::RewriteObjCIvarRefExpr(ObjCIvarRefExpr *IV) {
           std::string RecName = std::string(CDecl->getName());
           RecName += "_IMPL";
           RecordDecl *RD = RecordDecl::Create(
-              *Context, TTK_Struct, TUDecl, SourceLocation(), SourceLocation(),
-              &Context->Idents.get(RecName));
+              *Context, TagTypeKind::Struct, TUDecl, SourceLocation(),
+              SourceLocation(), &Context->Idents.get(RecName));
           QualType PtrStructIMPL = Context->getPointerType(Context->getTagDeclType(RD));
           unsigned UnsignedIntSize =
             static_cast<unsigned>(Context->getTypeSize(Context->UnsignedIntTy));
diff --git a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp
index 89ffb8908d37b73..71d0e95f9bf375d 100644
--- a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp
+++ b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp
@@ -2357,7 +2357,7 @@ void RewriteObjC::SynthMsgSendFunctionDecl() {
 void RewriteObjC::SynthMsgSendSuperFunctionDecl() {
   IdentifierInfo *msgSendIdent = &Context->Idents.get("objc_msgSendSuper");
   SmallVector<QualType, 16> ArgTys;
-  RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
+  RecordDecl *RD = RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
                                       SourceLocation(), SourceLocation(),
                                       &Context->Idents.get("objc_super"));
   QualType argT = Context->getPointerType(Context->getTagDeclType(RD));
@@ -2400,7 +2400,7 @@ void RewriteObjC::SynthMsgSendSuperStretFunctionDecl() {
   IdentifierInfo *msgSendIdent =
     &Context->Idents.get("objc_msgSendSuper_stret");
   SmallVector<QualType, 16> ArgTys;
-  RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
+  RecordDecl *RD = RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
                                       SourceLocation(), SourceLocation(),
                                       &Context->Idents.get("objc_super"));
   QualType argT = Context->getPointerType(Context->getTagDeclType(RD));
@@ -2531,7 +2531,7 @@ Stmt *RewriteObjC::RewriteObjCStringLiteral(ObjCStringLiteral *Exp) {
 // struct objc_super { struct objc_object *receiver; struct objc_class *super; };
 QualType RewriteObjC::getSuperStructType() {
   if (!SuperStructDecl) {
-    SuperStructDecl = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
+    SuperStructDecl = RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
                                          SourceLocation(), SourceLocation(),
                                          &Context->Idents.get("objc_super"));
     QualType FieldTypes[2];
@@ -2559,9 +2559,9 @@ QualType RewriteObjC::getSuperStructType() {
 
 QualType RewriteObjC::getConstantStringStructType() {
   if (!ConstantStringDecl) {
-    ConstantStringDecl = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
-                                            SourceLocation(), SourceLocation(),
-                         &Context->Idents.get("__NSConstantStringImpl"));
+    ConstantStringDecl = RecordDecl::Create(
+        *Context, TagTypeKind::Struct, TUDecl, SourceLocation(),
+        SourceLocation(), &Context->Idents.get("__NSConstantStringImpl"));
     QualType FieldTypes[4];
 
     // struct objc_object *receiver;
@@ -3755,7 +3755,7 @@ Stmt *RewriteObjC::SynthesizeBlockCall(CallExpr *Exp, const Expr *BlockExp) {
   const FunctionProtoType *FTP = dyn_cast<FunctionProtoType>(FT);
   // FTP will be null for closures that don't take arguments.
 
-  RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
+  RecordDecl *RD = RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
                                       SourceLocation(), SourceLocation(),
                                       &Context->Idents.get("__block_impl"));
   QualType PtrBlock = Context->getPointerType(Context->getTagDeclType(RD));
@@ -4483,9 +4483,9 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp,
       RewriteByRefString(RecName, Name, ND, true);
       IdentifierInfo *II = &Context->Idents.get(RecName.c_str()
                                                 + sizeof("struct"));
-      RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
-                                          SourceLocation(), SourceLocation(),
-                                          II);
+      RecordDecl *RD =
+          RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
+                             SourceLocation(), SourceLocation(), II);
       assert(RD && "SynthBlockInitExpr(): Can't find RecordDecl");
       QualType castT = Context->getPointerType(Context->getTagDeclType(RD));
 
@@ -5821,9 +5821,9 @@ Stmt *RewriteObjCFragileABI::RewriteObjCIvarRefExpr(ObjCIvarRefExpr *IV) {
           std::string(clsDeclared->getIdentifier()->getName());
       RecName += "_IMPL";
       IdentifierInfo *II = &Context->Idents.get(RecName);
-      RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
-                                          SourceLocation(), SourceLocation(),
-                                          II);
+      RecordDecl *RD =
+          RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
+                             SourceLocation(), SourceLocation(), II);
       assert(RD && "RewriteObjCIvarRefExpr(): Can't find RecordDecl");
       QualType castT = Context->getPointerType(Context->getTagDeclType(RD));
       CastExpr *castExpr = NoTypeInfoCStyleCastExpr(Context, castT,
@@ -5862,9 +5862,9 @@ Stmt *RewriteObjCFragileABI::RewriteObjCIvarRefExpr(ObjCIvarRefExpr *IV) {
           std::string(clsDeclared->getIdentifier()->getName());
       RecName += "_IMPL";
       IdentifierInfo *II = &Context->Idents.get(RecName);
-      RecordDecl *RD = RecordDecl::Create(*Context, TTK_Struct, TUDecl,
-                                          SourceLocation(), SourceLocation(),
-                                          II);
+      RecordDecl *RD =
+          RecordDecl::Create(*Context, TagTypeKind::Struct, TUDecl,
+                             SourceLocation(), SourceLocation(), II);
       assert(RD && "RewriteObjCIvarRefExpr(): Can't find RecordDecl");
       QualType castT = Context->getPointerType(Context->getTagDeclType(RD));
       CastExpr *castExpr = NoTypeInfoCStyleCastExpr(Context, castT,
diff --git a/clang/lib/Index/IndexSymbol.cpp b/clang/lib/Index/IndexSymbol.cpp
index 10384660d2cb683..c67810ad126b6e2 100644
--- a/clang/lib/Index/IndexSymbol.cpp
+++ b/clang/lib/Index/IndexSymbol.cpp
@@ -107,19 +107,19 @@ SymbolInfo index::getSymbolInfo(const Decl *D) {
 
   if (const TagDecl *TD = dyn_cast<TagDecl>(D)) {
     switch (TD->getTagKind()) {
-    case TTK_Struct:
+    case TagTypeKind::Struct:
       Info.Kind = SymbolKind::Struct; break;
-    case TTK_Union:
+    case TagTypeKind::Union:
       Info.Kind = SymbolKind::Union; break;
-    case TTK_Class:
+    case TagTypeKind::Class:
       Info.Kind = SymbolKind::Class;
       Info.Lang = SymbolLanguage::CXX;
       break;
-    case TTK_Interface:
+    case TagTypeKind::Interface:
       Info.Kind = SymbolKind::Protocol;
       Info.Lang = SymbolLanguage::CXX;
       break;
-    case TTK_Enum:
+    case TagTypeKind::Enum:
       Info.Kind = SymbolKind::Enum; break;
     }
 
diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp
index 0eee6fe6d820739..fb936d9fbf8ab47 100644
--- a/clang/lib/Index/USRGeneration.cpp
+++ b/clang/lib/Index/USRGeneration.cpp
@@ -519,11 +519,16 @@ void USRGenerator::VisitTagDecl(const TagDecl *D) {
       AlreadyStarted = true;
 
       switch (D->getTagKind()) {
-      case TTK_Interface:
-      case TTK_Class:
-      case TTK_Struct: Out << "@ST"; break;
-      case TTK_Union:  Out << "@UT"; break;
-      case TTK_Enum: llvm_unreachable("enum template");
+      case TagTypeKind::Interface:
+      case TagTypeKind::Class:
+      case TagTypeKind::Struct:
+        Out << "@ST";
+        break;
+      case TagTypeKind::Union:
+        Out << "@UT";
+        break;
+      case TagTypeKind::Enum:
+        llvm_unreachable("enum template");
       }
       VisitTemplateParameterList(ClassTmpl->getTemplateParameters());
     } else if (const ClassTemplatePartialSpecializationDecl *PartialSpec
@@ -531,11 +536,16 @@ void USRGenerator::VisitTagDecl(const TagDecl *D) {
       AlreadyStarted = true;
 
       switch (D->getTagKind()) {
-      case TTK_Interface:
-      case TTK_Class:
-      case TTK_Struct: Out << "@SP"; break;
-      case TTK_Union:  Out << "@UP"; break;
-      case TTK_Enum: llvm_unreachable("enum partial specialization");
+      case TagTypeKind::Interface:
+      case TagTypeKind::Class:
+      case TagTypeKind::Struct:
+        Out << "@SP";
+        break;
+      case TagTypeKind::Union:
+        Out << "@UP";
+        break;
+      case TagTypeKind::Enum:
+        llvm_unreachable("enum partial specialization");
       }
       VisitTemplateParameterList(PartialSpec->getTemplateParameters());
     }
@@ -543,11 +553,17 @@ void USRGenerator::VisitTagDecl(const TagDecl *D) {
 
   if (!AlreadyStarted) {
     switch (D->getTagKind()) {
-      case TTK_Interface:
-      case TTK_Class:
-      case TTK_Struct: Out << "@S"; break;
-      case TTK_Union:  Out << "@U"; break;
-      case TTK_Enum:   Out << "@E"; break;
+    case TagTypeKind::Interface:
+    case TagTypeKind::Class:
+    case TagTypeKind::Struct:
+      Out << "@S";
+      break;
+    case TagTypeKind::Union:
+      Out << "@U";
+      break;
+    case TagTypeKind::Enum:
+      Out << "@E";
+      break;
     }
   }
 
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index 0be76d4b36e046c..8de144486c91df6 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -62,9 +62,9 @@ struct BuiltinTypeDeclBuilder {
       return;
     }
 
-    Record = CXXRecordDecl::Create(AST, TagDecl::TagKind::TTK_Class,
-                                   HLSLNamespace, SourceLocation(),
-                                   SourceLocation(), &II, PrevDecl, true);
+    Record = CXXRecordDecl::Create(AST, TagDecl::TagKind::Class, HLSLNamespace,
+                                   SourceLocation(), SourceLocation(), &II,
+                                   PrevDecl, true);
     Record->setImplicit(true);
     Record->setLexicalDeclContext(HLSLNamespace);
     Record->setHasExternalLexicalStorage();
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 8e78d2c11f92d61..d25edd470334f63 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -329,8 +329,9 @@ void Sema::Initialize() {
   if (getLangOpts().MSVCCompat) {
     if (getLangOpts().CPlusPlus &&
         IdResolver.begin(&Context.Idents.get("type_info")) == IdResolver.end())
-      PushOnScopeChains(Context.buildImplicitRecord("type_info", TTK_Class),
-                        TUScope);
+      PushOnScopeChains(
+          Context.buildImplicitRecord("type_info", TagTypeKind::Class),
+          TUScope);
 
     addImplicitTypedef("size_t", Context.getSizeType());
   }
diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
index adb82d3f6d176ab..3355336d8c2c816 100644
--- a/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/clang/lib/Sema/SemaCodeComplete.cpp
@@ -1576,8 +1576,9 @@ bool ResultBuilder::IsClassOrStruct(const NamedDecl *ND) const {
 
   // For purposes of this check, interfaces match too.
   if (const auto *RD = dyn_cast<RecordDecl>(ND))
-    return RD->getTagKind() == TTK_Class || RD->getTagKind() == TTK_Struct ||
-           RD->getTagKind() == TTK_Interface;
+    return RD->getTagKind() == TagTypeKind::Class ||
+           RD->getTagKind() == TagTypeKind::Struct ||
+           RD->getTagKind() == TagTypeKind::Interface;
 
   return false;
 }
@@ -1589,7 +1590,7 @@ bool ResultBuilder::IsUnion(const NamedDecl *ND) const {
     ND = ClassTemplate->getTemplatedDecl();
 
   if (const auto *RD = dyn_cast<RecordDecl>(ND))
-    return RD->getTagKind() == TTK_Union;
+    return RD->getTagKind() == TagTypeKind::Union;
 
   return false;
 }
@@ -2018,15 +2019,15 @@ static const char *GetCompletionTypeString(QualType T, ASTContext &Context,
       if (TagDecl *Tag = TagT->getDecl())
         if (!Tag->hasNameForLinkage()) {
           switch (Tag->getTagKind()) {
-          case TTK_Struct:
+          case TagTypeKind::Struct:
             return "struct <anonymous>";
-          case TTK_Interface:
+          case TagTypeKind::Interface:
             return "__interface <anonymous>";
-          case TTK_Class:
+          case TagTypeKind::Class:
             return "class <anonymous>";
-          case TTK_Union:
+          case TagTypeKind::Union:
             return "union <anonymous>";
-          case TTK_Enum:
+          case TagTypeKind::Enum:
             return "enum <anonymous>";
           }
         }
@@ -4167,14 +4168,14 @@ CXCursorKind clang::getCursorKindForDecl(const Decl *D) {
   default:
     if (const auto *TD = dyn_cast<TagDecl>(D)) {
       switch (TD->getTagKind()) {
-      case TTK_Interface: // fall through
-      case TTK_Struct:
+      case TagTypeKind::Interface: // fall through
+      case TagTypeKind::Struct:
         return CXCursor_StructDecl;
-      case TTK_Class:
+      case TagTypeKind::Class:
         return CXCursor_ClassDecl;
-      case TTK_Union:
+      case TagTypeKind::Union:
         return CXCursor_UnionDecl;
-      case TTK_Enum:
+      case TagTypeKind::Enum:
         return CXCursor_EnumDecl;
       }
     }
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index b4affa7277f6b2b..396566a8f10a9b7 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -680,11 +680,16 @@ DeclSpec::TST Sema::isTagName(IdentifierInfo &II, Scope *S) {
   if (R.getResultKind() == LookupResult::Found)
     if (const TagDecl *TD = R.getAsSingle<TagDecl>()) {
       switch (TD->getTagKind()) {
-      case TTK_Struct: return DeclSpec::TST_struct;
-      case TTK_Interface: return DeclSpec::TST_interface;
-      case TTK_Union:  return DeclSpec::TST_union;
-      case TTK_Class:  return DeclSpec::TST_class;
-      case TTK_Enum:   return DeclSpec::TST_enum;
+      case TagTypeKind::Struct:
+        return DeclSpec::TST_struct;
+      case TagTypeKind::Interface:
+        return DeclSpec::TST_interface;
+      case TagTypeKind::Union:
+        return DeclSpec::TST_union;
+      case TagTypeKind::Class:
+        return DeclSpec::TST_class;
+      case TagTypeKind::Enum:
+        return DeclSpec::TST_enum;
       }
     }
 
@@ -860,25 +865,25 @@ static bool isTagTypeWithMissingTag(Sema &SemaRef, LookupResult &Result,
   if (TagDecl *Tag = R.getAsSingle<TagDecl>()) {
     StringRef FixItTagName;
     switch (Tag->getTagKind()) {
-      case TTK_Class:
-        FixItTagName = "class ";
-        break;
+    case TagTypeKind::Class:
+      FixItTagName = "class ";
+      break;
 
-      case TTK_Enum:
-        FixItTagName = "enum ";
-        break;
+    case TagTypeKind::Enum:
+      FixItTagName = "enum ";
+      break;
 
-      case TTK_Struct:
-        FixItTagName = "struct ";
-        break;
+    case TagTypeKind::Struct:
+      FixItTagName = "struct ";
+      break;
 
-      case TTK_Interface:
-        FixItTagName = "__interface ";
-        break;
+    case TagTypeKind::Interface:
+      FixItTagName = "__interface ";
+      break;
 
-      case TTK_Union:
-        FixItTagName = "union ";
-        break;
+    case TagTypeKind::Union:
+      FixItTagName = "union ";
+      break;
     }
 
     StringRef TagName = FixItTagName.drop_back();
@@ -5268,8 +5273,8 @@ Decl *Sema::ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS,
   if (DS.isModulePrivateSpecified() &&
       Tag && Tag->getDeclContext()->isFunctionOrMethod())
     Diag(DS.getModulePrivateSpecLoc(), diag::err_module_private_local_class)
-      << Tag->getTagKind()
-      << FixItHint::CreateRemoval(DS.getModulePrivateSpecLoc());
+        << llvm::to_underlying(Tag->getTagKind())
+        << FixItHint::CreateRemoval(DS.getModulePrivateSpecLoc());
 
   ActOnDocumentableDecl(TagD);
 
@@ -7667,14 +7672,15 @@ NamedDecl *Sema::ActOnVariableDeclarator(
           // members.
           Diag(D.getIdentifierLoc(),
                diag::err_static_data_member_not_allowed_in_local_class)
-            << Name << RD->getDeclName() << RD->getTagKind();
+              << Name << RD->getDeclName()
+              << llvm::to_underlying(RD->getTagKind());
         } else if (AnonStruct) {
           // C++ [class.static.data]p4: Unnamed classes and classes contained
           // directly or indirectly within unnamed classes shall not contain
           // static data members.
           Diag(D.getIdentifierLoc(),
                diag::err_static_data_member_not_allowed_in_anon_struct)
-            << Name << AnonStruct->getTagKind();
+              << Name << llvm::to_underlying(AnonStruct->getTagKind());
           Invalid = true;
         } else if (RD->isUnion()) {
           // C++98 [class.union]p1: If a union contains a static data member,
@@ -16766,9 +16772,12 @@ bool Sema::CheckEnumRedeclaration(SourceLocation EnumLoc, bool IsScoped,
 /// \returns diagnostic %select index.
 static unsigned getRedeclDiagFromTagKind(TagTypeKind Tag) {
   switch (Tag) {
-  case TTK_Struct: return 0;
-  case TTK_Interface: return 1;
-  case TTK_Class:  return 2;
+  case TagTypeKind::Struct:
+    return 0;
+  case TagTypeKind::Interface:
+    return 1;
+  case TagTypeKind::Class:
+    return 2;
   default: llvm_unreachable("Invalid tag kind for redecl diagnostic!");
   }
 }
@@ -16779,7 +16788,8 @@ static unsigned getRedeclDiagFromTagKind(TagTypeKind Tag) {
 /// \returns true iff the tag kind is compatible.
 static bool isClassCompatTagKind(TagTypeKind Tag)
 {
-  return Tag == TTK_Struct || Tag == TTK_Class || Tag == TTK_Interface;
+  return Tag == TagTypeKind::Struct || Tag == TagTypeKind::Class ||
+         Tag == TagTypeKind::Interface;
 }
 
 Sema::NonTagKind Sema::getNonTagTypeDeclKind(const Decl *PrevDecl,
@@ -16795,13 +16805,13 @@ Sema::NonTagKind Sema::getNonTagTypeDeclKind(const Decl *PrevDecl,
   else if (isa<TemplateTemplateParmDecl>(PrevDecl))
     return NTK_TemplateTemplateArgument;
   switch (TTK) {
-  case TTK_Struct:
-  case TTK_Interface:
-  case TTK_Class:
+  case TagTypeKind::Struct:
+  case TagTypeKind::Interface:
+  case TagTypeKind::Class:
     return getLangOpts().CPlusPlus ? NTK_NonClass : NTK_NonStruct;
-  case TTK_Union:
+  case TagTypeKind::Union:
     return NTK_NonUnion;
-  case TTK_Enum:
+  case TagTypeKind::Enum:
     return NTK_NonEnum;
   }
   llvm_unreachable("invalid TTK");
@@ -17037,7 +17047,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
             MatchTemplateParametersToScopeSpecifier(
                 KWLoc, NameLoc, SS, nullptr, TemplateParameterLists,
                 TUK == TUK_Friend, isMemberSpecialization, Invalid)) {
-      if (Kind == TTK_Enum) {
+      if (Kind == TagTypeKind::Enum) {
         Diag(KWLoc, diag::err_enum_template);
         return true;
       }
@@ -17075,7 +17085,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
   llvm::PointerUnion<const Type*, TypeSourceInfo*> EnumUnderlying;
   bool IsFixed = !UnderlyingType.isUnset() || ScopedEnum;
 
-  if (Kind == TTK_Enum) {
+  if (Kind == TagTypeKind::Enum) {
     if (UnderlyingType.isInvalid() || (!UnderlyingType.get() && ScopedEnum)) {
       // No underlying type explicitly specified, or we failed to parse the
       // type, default to int.
@@ -17125,7 +17135,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
     SourceLocation Loc = NameLoc.isValid() ? NameLoc : KWLoc;
     TagDecl *New = nullptr;
 
-    if (Kind == TTK_Enum) {
+    if (Kind == TagTypeKind::Enum) {
       New = EnumDecl::Create(Context, SearchDC, KWLoc, Loc, Name, nullptr,
                              ScopedEnum, ScopedEnumUsesClassTag, IsFixed);
       // If this is an undefined enum, bail.
@@ -17218,7 +17228,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
 
       // A tag 'foo::bar' must already exist.
       Diag(NameLoc, diag::err_not_tag_in_scope)
-        << Kind << Name << DC << SS.getRange();
+          << llvm::to_underlying(Kind) << Name << DC << SS.getRange();
       Name = nullptr;
       Invalid = true;
       goto CreateNewDecl;
@@ -17478,9 +17488,9 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
         if (!isAcceptableTagRedeclaration(PrevTagDecl, Kind,
                                           TUK == TUK_Definition, KWLoc,
                                           Name)) {
-          bool SafeToContinue
-            = (PrevTagDecl->getTagKind() != TTK_Enum &&
-               Kind != TTK_Enum);
+          bool SafeToContinue =
+              (PrevTagDecl->getTagKind() != TagTypeKind::Enum &&
+               Kind != TagTypeKind::Enum);
           if (SafeToContinue)
             Diag(KWLoc, diag::err_use_with_wrong_tag)
               << Name
@@ -17500,7 +17510,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
           }
         }
 
-        if (Kind == TTK_Enum && PrevTagDecl->getTagKind() == TTK_Enum) {
+        if (Kind == TagTypeKind::Enum &&
+            PrevTagDecl->getTagKind() == TagTypeKind::Enum) {
           const EnumDecl *PrevEnum = cast<EnumDecl>(PrevTagDecl);
           if (TUK == TUK_Reference || TUK == TUK_Friend)
             return PrevTagDecl;
@@ -17670,8 +17681,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
       if ((TUK == TUK_Reference || TUK == TUK_Friend) &&
           !Previous.isForRedeclaration()) {
         NonTagKind NTK = getNonTagTypeDeclKind(PrevDecl, Kind);
-        Diag(NameLoc, diag::err_tag_reference_non_tag) << PrevDecl << NTK
-                                                       << Kind;
+        Diag(NameLoc, diag::err_tag_reference_non_tag)
+            << PrevDecl << NTK << llvm::to_underlying(Kind);
         Diag(PrevDecl->getLocation(), diag::note_declared_at);
         Invalid = true;
 
@@ -17729,7 +17740,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
   // PrevDecl.
   TagDecl *New;
 
-  if (Kind == TTK_Enum) {
+  if (Kind == TagTypeKind::Enum) {
     // FIXME: Tag decls should be chained to any simultaneous vardecls, e.g.:
     // enum X { A, B, C } D;    D should chain to X.
     New = EnumDecl::Create(Context, SearchDC, KWLoc, Loc, Name,
@@ -17864,7 +17875,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
   // If we're declaring or defining a tag in function prototype scope in C,
   // note that this type can only be used within the function and add it to
   // the list of decls to inject into the function definition scope.
-  if ((Name || Kind == TTK_Enum) &&
+  if ((Name || Kind == TagTypeKind::Enum) &&
       getNonFieldDeclScope(S)->isFunctionPrototypeScope()) {
     if (getLangOpts().CPlusPlus) {
       // C++ [dcl.fct]p6:
@@ -19038,7 +19049,8 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl,
         unsigned DiagID = 0;
         if (!Record->isUnion() && !IsLastField) {
           Diag(FD->getLocation(), diag::err_flexible_array_not_at_end)
-            << FD->getDeclName() << FD->getType() << Record->getTagKind();
+              << FD->getDeclName() << FD->getType()
+              << llvm::to_underlying(Record->getTagKind());
           Diag((*(i + 1))->getLocation(), diag::note_next_field_declaration);
           FD->setInvalidDecl();
           EnclosingDecl->setInvalidDecl();
@@ -19057,8 +19069,8 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl,
                              : diag::err_flexible_array_empty_aggregate;
 
         if (DiagID)
-          Diag(FD->getLocation(), DiagID) << FD->getDeclName()
-                                          << Record->getTagKind();
+          Diag(FD->getLocation(), DiagID)
+              << FD->getDeclName() << llvm::to_underlying(Record->getTagKind());
         // While the layout of types that contain virtual bases is not specified
         // by the C++ standard, both the Itanium and Microsoft C++ ABIs place
         // virtual bases after the derived members.  This would make a flexible
@@ -19066,10 +19078,10 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl,
         // of the type.
         if (CXXRecord && CXXRecord->getNumVBases() != 0)
           Diag(FD->getLocation(), diag::err_flexible_array_virtual_base)
-              << FD->getDeclName() << Record->getTagKind();
+              << FD->getDeclName() << llvm::to_underlying(Record->getTagKind());
         if (!getLangOpts().C99)
           Diag(FD->getLocation(), diag::ext_c99_flexible_array_member)
-            << FD->getDeclName() << Record->getTagKind();
+              << FD->getDeclName() << llvm::to_underlying(Record->getTagKind());
 
         // If the element type has a non-trivial destructor, we would not
         // implicitly destroy the elements, so disallow it for now.
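(The llvm::to_underlying calls added throughout SemaDecl.cpp are the direct consequence of the enum-class switch: the diagnostic builder's operator<< consumes an integer to drive a %select, and a scoped enumerator no longer converts to one implicitly. A sketch of the conversion, with a local stand-in for llvm::to_underlying from llvm/ADT/STLForwardCompat.h (which itself mirrors C++23 std::to_underlying):

  #include <iostream>
  #include <type_traits>

  enum class TagTypeKind { Struct, Interface, Union, Class, Enum };

  // Stand-in for llvm::to_underlying.
  template <typename E>
  constexpr std::underlying_type_t<E> to_underlying(E V) {
    return static_cast<std::underlying_type_t<E>>(V);
  }

  int main() {
    TagTypeKind K = TagTypeKind::Class;
    // std::cout << K;                   // ill-formed: no implicit conversion
    std::cout << to_underlying(K) << '\n'; // prints 3, given the order above
  }
)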
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 842a01a88cd3c6d..cdb769a883550d0 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -189,7 +189,7 @@ static inline bool isCFStringType(QualType T, ASTContext &Ctx) {
     return false;
 
   const RecordDecl *RD = RT->getDecl();
-  if (RD->getTagKind() != TTK_Struct)
+  if (RD->getTagKind() != TagTypeKind::Struct)
     return false;
 
   return RD->getIdentifier() == &Ctx.Idents.get("__CFString");
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index be79defbbfac6f1..397b7a00e453126 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -1781,9 +1781,12 @@ static bool CheckConstexprReturnType(Sema &SemaRef, const FunctionDecl *FD,
 /// \returns diagnostic %select index.
 static unsigned getRecordDiagFromTagKind(TagTypeKind Tag) {
   switch (Tag) {
-  case TTK_Struct: return 0;
-  case TTK_Interface: return 1;
-  case TTK_Class:  return 2;
+  case TagTypeKind::Struct:
+    return 0;
+  case TagTypeKind::Interface:
+    return 1;
+  case TagTypeKind::Class:
+    return 2;
   default: llvm_unreachable("Invalid tag kind for record diagnostic!");
   }
 }
@@ -2680,7 +2683,7 @@ Sema::CheckBaseSpecifier(CXXRecordDecl *Class,
                          TypeSourceInfo *TInfo,
                          SourceLocation EllipsisLoc) {
   // In HLSL, unspecified class access is public rather than private.
-  if (getLangOpts().HLSL && Class->getTagKind() == TTK_Class &&
+  if (getLangOpts().HLSL && Class->getTagKind() == TagTypeKind::Class &&
       Access == AS_none)
     Access = AS_public;
 
@@ -2733,9 +2736,9 @@ Sema::CheckBaseSpecifier(CXXRecordDecl *Class,
     // emitted.
     if (!Class->getTypeForDecl()->isDependentType())
       Class->setInvalidDecl();
-    return new (Context) CXXBaseSpecifier(SpecifierRange, Virtual,
-                                          Class->getTagKind() == TTK_Class,
-                                          Access, TInfo, EllipsisLoc);
+    return new (Context) CXXBaseSpecifier(
+        SpecifierRange, Virtual, Class->getTagKind() == TagTypeKind::Class,
+        Access, TInfo, EllipsisLoc);
   }
 
   // Base specifiers must be record types.
@@ -2821,9 +2824,9 @@ Sema::CheckBaseSpecifier(CXXRecordDecl *Class,
     Class->setInvalidDecl();
 
   // Create the base specifier.
-  return new (Context) CXXBaseSpecifier(SpecifierRange, Virtual,
-                                        Class->getTagKind() == TTK_Class,
-                                        Access, TInfo, EllipsisLoc);
+  return new (Context) CXXBaseSpecifier(
+      SpecifierRange, Virtual, Class->getTagKind() == TagTypeKind::Class,
+      Access, TInfo, EllipsisLoc);
 }
 
 /// ActOnBaseSpecifier - Parsed a base specifier. A base specifier is
@@ -7010,7 +7013,7 @@ void Sema::CheckCompletedCXXClass(Scope *S, CXXRecordDecl *Record) {
           (F->getType().isConstQualified() && F->getType()->isScalarType())) {
         if (!Complained) {
           Diag(Record->getLocation(), diag::warn_no_constructor_for_refconst)
-            << Record->getTagKind() << Record;
+              << llvm::to_underlying(Record->getTagKind()) << Record;
           Complained = true;
         }
 
diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp
index 8685838157b5cfa..cdfa6ad3f281a43 100644
--- a/clang/lib/Sema/SemaDeclObjC.cpp
+++ b/clang/lib/Sema/SemaDeclObjC.cpp
@@ -3884,7 +3884,7 @@ static void DiagnoseVariableSizedIvars(Sema &S, ObjCContainerDecl *OCD) {
     if (IvarTy->isIncompleteArrayType()) {
       S.Diag(ivar->getLocation(), diag::err_flexible_array_not_at_end)
           << ivar->getDeclName() << IvarTy
-          << TTK_Class; // Use "class" for Obj-C.
+          << llvm::to_underlying(TagTypeKind::Class); // Use "class" for Obj-C.
       IsInvalidIvar = true;
     } else if (const RecordType *RecordTy = IvarTy->getAs<RecordType>()) {
       if (RecordTy->getDecl()->hasFlexibleArrayMember()) {
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index ea286c9709c13ff..093c57d64a7124d 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -3035,11 +3035,10 @@ void Sema::DeclareGlobalNewDelete() {
   if (!StdBadAlloc && !getLangOpts().CPlusPlus11) {
     // The "std::bad_alloc" class has not yet been declared, so build it
     // implicitly.
-    StdBadAlloc = CXXRecordDecl::Create(Context, TTK_Class,
-                                        getOrCreateStdNamespace(),
-                                        SourceLocation(), SourceLocation(),
-                                      &PP.getIdentifierTable().get("bad_alloc"),
-                                        nullptr);
+    StdBadAlloc = CXXRecordDecl::Create(
+        Context, TagTypeKind::Class, getOrCreateStdNamespace(),
+        SourceLocation(), SourceLocation(),
+        &PP.getIdentifierTable().get("bad_alloc"), nullptr);
     getStdBadAlloc()->setImplicit(true);
 
     // The implicitly declared "std::bad_alloc" should live in global module
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 333226963aeac5e..5ef0aaa40ba1f28 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -4692,10 +4692,11 @@ Sema::CreateCapturedStmtRecordDecl(CapturedDecl *&CD, SourceLocation Loc,
 
   RecordDecl *RD = nullptr;
   if (getLangOpts().CPlusPlus)
-    RD = CXXRecordDecl::Create(Context, TTK_Struct, DC, Loc, Loc,
+    RD = CXXRecordDecl::Create(Context, TagTypeKind::Struct, DC, Loc, Loc,
                                /*Id=*/nullptr);
   else
-    RD = RecordDecl::Create(Context, TTK_Struct, DC, Loc, Loc, /*Id=*/nullptr);
+    RD = RecordDecl::Create(Context, TagTypeKind::Struct, DC, Loc, Loc,
+                            /*Id=*/nullptr);
 
   RD->setCapturedRecord();
   DC->addDecl(RD);
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 4d6b2376ba78669..ee354862212803f 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1841,7 +1841,8 @@ DeclResult Sema::CheckClassTemplate(
     return true;
 
   TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
-  assert(Kind != TTK_Enum && "can't build template of enumerated type");
+  assert(Kind != TagTypeKind::Enum &&
+         "can't build template of enumerated type");
 
   // There is no such thing as an unnamed class template.
   if (!Name) {
@@ -4292,7 +4293,7 @@ TypeResult Sema::ActOnTagTemplateIdType(TagUseKind TUK,
     //   resolves to an alias template specialization, the
     //   elaborated-type-specifier is ill-formed.
     Diag(TemplateLoc, diag::err_tag_reference_non_tag)
-        << TAT << NTK_TypeAliasTemplate << TagKind;
+        << TAT << NTK_TypeAliasTemplate << llvm::to_underlying(TagKind);
     Diag(TAT->getLocation(), diag::note_declared_at);
   }
 
@@ -8722,7 +8723,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
   // Check that the specialization uses the same tag kind as the
   // original template.
   TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
-  assert(Kind != TTK_Enum && "Invalid enum tag in class template spec!");
+  assert(Kind != TagTypeKind::Enum &&
+         "Invalid enum tag in class template spec!");
   if (!isAcceptableTagRedeclaration(ClassTemplate->getTemplatedDecl(),
                                     Kind, TUK == TUK_Definition, KWLoc,
                                     ClassTemplate->getIdentifier())) {
@@ -9968,14 +9970,15 @@ DeclResult Sema::ActOnExplicitInstantiation(
   // Check that the specialization uses the same tag kind as the
   // original template.
   TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
-  assert(Kind != TTK_Enum &&
+  assert(Kind != TagTypeKind::Enum &&
          "Invalid enum tag in class template explicit instantiation!");
 
   ClassTemplateDecl *ClassTemplate = dyn_cast<ClassTemplateDecl>(TD);
 
   if (!ClassTemplate) {
     NonTagKind NTK = getNonTagTypeDeclKind(TD, Kind);
-    Diag(TemplateNameLoc, diag::err_tag_reference_non_tag) << TD << NTK << Kind;
+    Diag(TemplateNameLoc, diag::err_tag_reference_non_tag)
+        << TD << NTK << llvm::to_underlying(Kind);
     Diag(TD->getLocation(), diag::note_previous_use);
     return true;
   }
@@ -10800,7 +10803,8 @@ Sema::ActOnDependentTag(Scope *S, unsigned TagSpec, TagUseKind TUK,
 
   if (TUK == TUK_Declaration || TUK == TUK_Definition) {
     Diag(NameLoc, diag::err_dependent_tag_decl)
-      << (TUK == TUK_Definition) << Kind << SS.getRange();
+        << (TUK == TUK_Definition) << llvm::to_underlying(Kind)
+        << SS.getRange();
     return true;
   }
 
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 8edcbf4709a2301..011356e08a04297 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -1662,8 +1662,8 @@ Decl *TemplateDeclInstantiator::VisitClassTemplateDecl(ClassTemplateDecl *D) {
 
     if (!PrevClassTemplate && QualifierLoc) {
       SemaRef.Diag(Pattern->getLocation(), diag::err_not_tag_in_scope)
-        << D->getTemplatedDecl()->getTagKind() << Pattern->getDeclName() << DC
-        << QualifierLoc.getSourceRange();
+          << llvm::to_underlying(D->getTemplatedDecl()->getTagKind())
+          << Pattern->getDeclName() << DC << QualifierLoc.getSourceRange();
       return nullptr;
     }
   }
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index a46deed8e7c58b4..560feafa1857cb3 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -3667,11 +3667,20 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
         Error = 6; // Interface member.
       } else {
         switch (cast<TagDecl>(SemaRef.CurContext)->getTagKind()) {
-        case TTK_Enum: llvm_unreachable("unhandled tag kind");
-        case TTK_Struct: Error = Cxx ? 1 : 2; /* Struct member */ break;
-        case TTK_Union:  Error = Cxx ? 3 : 4; /* Union member */ break;
-        case TTK_Class:  Error = 5; /* Class member */ break;
-        case TTK_Interface: Error = 6; /* Interface member */ break;
+        case TagTypeKind::Enum:
+          llvm_unreachable("unhandled tag kind");
+        case TagTypeKind::Struct:
+          Error = Cxx ? 1 : 2; /* Struct member */
+          break;
+        case TagTypeKind::Union:
+          Error = Cxx ? 3 : 4; /* Union member */
+          break;
+        case TagTypeKind::Class:
+          Error = 5; /* Class member */
+          break;
+        case TagTypeKind::Interface:
+          Error = 6; /* Interface member */
+          break;
         }
       }
       if (D.getDeclSpec().isFriendSpecified())
@@ -4413,7 +4422,7 @@ bool Sema::isCFError(RecordDecl *RD) {
   // NSError. CFErrorRef used to be declared with "objc_bridge" but is now
   // declared with "objc_bridge_mutable", so look for either one of the two
   // attributes.
-  if (RD->getTagKind() == TTK_Struct) {
+  if (RD->getTagKind() == TagTypeKind::Struct) {
     IdentifierInfo *bridgedType = nullptr;
     if (auto bridgeAttr = RD->getAttr<ObjCBridgeAttr>())
       bridgedType = bridgeAttr->getBridgedType();
@@ -9417,9 +9426,12 @@ bool Sema::RequireCompleteType(SourceLocation Loc, QualType T,
 /// \returns diagnostic %select index.
 static unsigned getLiteralDiagFromTagKind(TagTypeKind Tag) {
   switch (Tag) {
-  case TTK_Struct: return 0;
-  case TTK_Interface: return 1;
-  case TTK_Class:  return 2;
+  case TagTypeKind::Struct:
+    return 0;
+  case TagTypeKind::Interface:
+    return 1;
+  case TagTypeKind::Class:
+    return 2;
   default: llvm_unreachable("Invalid tag kind for literal type diagnostic!");
   }
 }
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 22357d55d37331b..ac12b39a5978b20 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -1209,14 +1209,15 @@ class TreeTransform {
         case LookupResult::FoundUnresolvedValue: {
           NamedDecl *SomeDecl = Result.getRepresentativeDecl();
           Sema::NonTagKind NTK = SemaRef.getNonTagTypeDeclKind(SomeDecl, Kind);
-          SemaRef.Diag(IdLoc, diag::err_tag_reference_non_tag) << SomeDecl
-                                                               << NTK << Kind;
+          SemaRef.Diag(IdLoc, diag::err_tag_reference_non_tag)
+              << SomeDecl << NTK << llvm::to_underlying(Kind);
           SemaRef.Diag(SomeDecl->getLocation(), diag::note_declared_at);
           break;
         }
         default:
           SemaRef.Diag(IdLoc, diag::err_not_tag_in_scope)
-              << Kind << Id << DC << QualifierLoc.getSourceRange();
+              << llvm::to_underlying(Kind) << Id << DC
+              << QualifierLoc.getSourceRange();
           break;
       }
       return QualType();
@@ -7029,7 +7030,8 @@ TreeTransform<Derived>::TransformElaboratedType(TypeLocBuilder &TLB,
         SemaRef.Diag(TL.getNamedTypeLoc().getBeginLoc(),
                      diag::err_tag_reference_non_tag)
             << TAT << Sema::NTK_TypeAliasTemplate
-            << ElaboratedType::getTagTypeKindForKeyword(T->getKeyword());
+            << llvm::to_underlying(
+                   ElaboratedType::getTagTypeKindForKeyword(T->getKeyword()));
         SemaRef.Diag(TAT->getLocation(), diag::note_declared_at);
       }
     }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index a63911cb4adfba5..79817b3fb1ec3a0 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -757,7 +757,8 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitTagDecl(TagDecl *TD) {
   TD->IdentifierNamespace = Record.readInt();
 
   BitsUnpacker TagDeclBits(Record.readInt());
-  TD->setTagKind((TagDecl::TagKind)TagDeclBits.getNextBits(/*Width=*/3));
+  TD->setTagKind(
+      static_cast<TagTypeKind>(TagDeclBits.getNextBits(/*Width=*/3)));
   TD->setCompleteDefinition(TagDeclBits.getNextBit());
   TD->setEmbeddedInDeclarator(TagDeclBits.getNextBit());
   TD->setFreeStanding(TagDeclBits.getNextBit());
@@ -4587,7 +4588,7 @@ void ASTDeclReader::UpdateDecl(Decl *D,
         }
       }
 
-      RD->setTagKind((TagTypeKind)Record.readInt());
+      RD->setTagKind(static_cast<TagTypeKind>(Record.readInt()));
       RD->setLocation(readSourceLocation());
       RD->setLocStart(readSourceLocation());
       RD->setBraceRange(readSourceRange());
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 2fe12f7ff300f02..1e86566d81fbc02 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5333,7 +5333,7 @@ void ASTWriter::WriteDeclUpdatesBlocks(RecordDataImpl &OffsetsRecord) {
             Record.push_back(false);
           }
         }
-        Record.push_back(RD->getTagKind());
+        Record.push_back(llvm::to_underlying(RD->getTagKind()));
         Record.AddSourceLocation(RD->getLocation());
         Record.AddSourceLocation(RD->getBeginLoc());
         Record.AddSourceRange(RD->getBraceRange());
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index b4438e4cf6a0c55..a06df18e898186b 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -445,7 +445,7 @@ void ASTDeclWriter::VisitTagDecl(TagDecl *D) {
   Record.push_back(D->getIdentifierNamespace());
 
   BitsPacker TagDeclBits;
-  TagDeclBits.addBits(D->getTagKind(), /*BitWidth=*/3);
+  TagDeclBits.addBits(llvm::to_underlying(D->getTagKind()), /*BitWidth=*/3);
   TagDeclBits.addBit(!isa<CXXRecordDecl>(D) ? D->isCompleteDefinition() : 0);
   TagDeclBits.addBit(D->isEmbeddedInDeclarator());
   TagDeclBits.addBit(D->isFreeStanding());
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
index 66d8588e2531589..d10924776f107de 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/NoUncountedMembersChecker.cpp
@@ -104,7 +104,7 @@ class NoUncountedMemberChecker
 
     const auto Kind = RD->getTagKind();
     // FIMXE: Should we check union members too?
-    if (Kind != TTK_Struct && Kind != TTK_Class)
+    if (Kind != TagTypeKind::Struct && Kind != TagTypeKind::Class)
       return true;
 
     // Ignore CXXRecords that come from system headers.
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
index 48dcfc4a3c4645d..bd7c50ccfa9c4a6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
@@ -114,7 +114,7 @@ class RefCntblBaseVirtualDtorChecker
       return true;
 
     const auto Kind = RD->getTagKind();
-    if (Kind != TTK_Struct && Kind != TTK_Class)
+    if (Kind != TagTypeKind::Struct && Kind != TagTypeKind::Class)
       return true;
 
     // Ignore CXXRecords that come from system headers.
diff --git a/clang/tools/libclang/CIndexCXX.cpp b/clang/tools/libclang/CIndexCXX.cpp
index a06fe7ba4af56c2..ea6f97d39644e18 100644
--- a/clang/tools/libclang/CIndexCXX.cpp
+++ b/clang/tools/libclang/CIndexCXX.cpp
@@ -63,11 +63,15 @@ enum CXCursorKind clang_getTemplateCursorKind(CXCursor C) {
           = dyn_cast_or_null<ClassTemplatePartialSpecializationDecl>(
                                                             getCursorDecl(C))) {
       switch (PartialSpec->getTagKind()) {
-      case TTK_Interface:
-      case TTK_Struct: return CXCursor_StructDecl;
-      case TTK_Class: return CXCursor_ClassDecl;
-      case TTK_Union: return CXCursor_UnionDecl;
-      case TTK_Enum: return CXCursor_NoDeclFound;
+      case TagTypeKind::Interface:
+      case TagTypeKind::Struct:
+        return CXCursor_StructDecl;
+      case TagTypeKind::Class:
+        return CXCursor_ClassDecl;
+      case TagTypeKind::Union:
+        return CXCursor_UnionDecl;
+      case TagTypeKind::Enum:
+        return CXCursor_NoDeclFound;
       }
     }
     break;
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 0cbec5cf9ba062f..5f4d8d040772cb1 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -7705,8 +7705,9 @@ TEST_P(ImportWithExternalSource, CompleteRecordBeforeImporting) {
 
   // Create a dummy class by hand with external lexical storage.
   IdentifierInfo &Ident = Context.Idents.get("test_class");
-  auto *Record = CXXRecordDecl::Create(
-      Context, TTK_Class, FromTU, SourceLocation(), SourceLocation(), &Ident);
+  auto *Record =
+      CXXRecordDecl::Create(Context, TagTypeKind::Class, FromTU,
+                            SourceLocation(), SourceLocation(), &Ident);
   Record->setHasExternalLexicalStorage();
   FromTU->addDecl(Record);
 
diff --git a/clang/utils/ClangVisualizers/clang.natvis b/clang/utils/ClangVisualizers/clang.natvis
index cbb63dc08de2338..2d1ad16797f8177 100644
--- a/clang/utils/ClangVisualizers/clang.natvis
+++ b/clang/utils/ClangVisualizers/clang.natvis
@@ -347,11 +347,11 @@ For later versions of Visual Studio, no setup is required-->
     <DisplayString IncludeView="implicit"></DisplayString>
     <DisplayString IncludeView="modifiers">{*this,view(implicit)nd}</DisplayString>
     <DisplayString IncludeView="cpp">{*this,view(modifiers)}{Name,view(cpp)}</DisplayString>
-    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::TTK_Struct">{*this,view(modifiers)nd}struct {Name,view(cpp)}</DisplayString>
-    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::TTK_Interface">{*this,view(modifiers)nd}interface {Name,view(cpp)}</DisplayString>
-    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::TTK_Union">{*this,view(modifiers)nd}union {Name,view(cpp)}</DisplayString>
-    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::TTK_Class">{*this,view(modifiers)nd}class {Name,view(cpp)}</DisplayString>
-    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::TTK_Enum">{*this,view(modifiers)nd}enum {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::Struct">{*this,view(modifiers)nd}struct {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::Interface">{*this,view(modifiers)nd}interface {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::Union">{*this,view(modifiers)nd}union {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::Class">{*this,view(modifiers)nd}class {Name,view(cpp)}</DisplayString>
+    <DisplayString Condition="TagDeclBits.TagDeclKind==clang::TagTypeKind::Enum">{*this,view(modifiers)nd}enum {Name,view(cpp)}</DisplayString>
     <Expand>
       <ExpandedItem>(clang::DeclContext *)this</ExpandedItem>
     </Expand>
diff --git a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp
index 2e927eb8d856918..605c79cbd9b5544 100644
--- a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp
+++ b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp
@@ -78,7 +78,8 @@ static CompilerType GetLLDBNSPairType(TargetSP target_sp) {
   if (!compiler_type) {
     compiler_type = scratch_ts_sp->CreateRecordType(
         nullptr, OptionalClangModuleID(), lldb::eAccessPublic,
-        g_lldb_autogen_nspair, clang::TTK_Struct, lldb::eLanguageTypeC);
+        g_lldb_autogen_nspair, llvm::to_underlying(clang::TagTypeKind::Struct),
+        lldb::eLanguageTypeC);
 
     if (compiler_type) {
       TypeSystemClang::StartTagDeclarationDefinition(compiler_type);
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp
index 2cc5319c84bbeb8..ca582cb1d5a46f4 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp
@@ -81,13 +81,13 @@ AppleObjCTypeEncodingParser::ReadStructElement(TypeSystemClang &ast_ctx,
 clang::QualType AppleObjCTypeEncodingParser::BuildStruct(
     TypeSystemClang &ast_ctx, StringLexer &type, bool for_expression) {
   return BuildAggregate(ast_ctx, type, for_expression, _C_STRUCT_B, _C_STRUCT_E,
-                        clang::TTK_Struct);
+                        llvm::to_underlying(clang::TagTypeKind::Struct));
 }
 
 clang::QualType AppleObjCTypeEncodingParser::BuildUnion(
     TypeSystemClang &ast_ctx, StringLexer &type, bool for_expression) {
   return BuildAggregate(ast_ctx, type, for_expression, _C_UNION_B, _C_UNION_E,
-                        clang::TTK_Union);
+                        llvm::to_underlying(clang::TagTypeKind::Union));
 }
 
 clang::QualType AppleObjCTypeEncodingParser::BuildAggregate(
diff --git a/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp b/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp
index 7abc71a1c53fe28..d7584be2b95eef8 100644
--- a/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp
+++ b/lldb/source/Plugins/Platform/FreeBSD/PlatformFreeBSD.cpp
@@ -206,7 +206,7 @@ CompilerType PlatformFreeBSD::GetSiginfoType(const llvm::Triple &triple) {
 
   CompilerType sigval_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_sigval_t",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(sigval_type);
   ast->AddFieldToRecordType(sigval_type, "sival_int", int_type,
                             lldb::eAccessPublic, 0);
@@ -217,7 +217,7 @@ CompilerType PlatformFreeBSD::GetSiginfoType(const llvm::Triple &triple) {
   // siginfo_t
   CompilerType siginfo_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_siginfo_t",
-      clang::TTK_Struct, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Struct), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(siginfo_type);
   ast->AddFieldToRecordType(siginfo_type, "si_signo", int_type,
                             lldb::eAccessPublic, 0);
@@ -239,7 +239,7 @@ CompilerType PlatformFreeBSD::GetSiginfoType(const llvm::Triple &triple) {
   // union used to hold the signal data
   CompilerType union_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(union_type);
 
   ast->AddFieldToRecordType(
diff --git a/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp b/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp
index dd50bfc2dc04a37..13465986f49c53d 100644
--- a/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp
+++ b/lldb/source/Plugins/Platform/Linux/PlatformLinux.cpp
@@ -348,7 +348,7 @@ CompilerType PlatformLinux::GetSiginfoType(const llvm::Triple &triple) {
 
   CompilerType sigval_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_sigval_t",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(sigval_type);
   ast->AddFieldToRecordType(sigval_type, "sival_int", int_type,
                             lldb::eAccessPublic, 0);
@@ -358,7 +358,7 @@ CompilerType PlatformLinux::GetSiginfoType(const llvm::Triple &triple) {
 
   CompilerType sigfault_bounds_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(sigfault_bounds_type);
   ast->AddFieldToRecordType(
       sigfault_bounds_type, "_addr_bnd",
@@ -375,7 +375,7 @@ CompilerType PlatformLinux::GetSiginfoType(const llvm::Triple &triple) {
   // siginfo_t
   CompilerType siginfo_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_siginfo_t",
-      clang::TTK_Struct, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Struct), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(siginfo_type);
   ast->AddFieldToRecordType(siginfo_type, "si_signo", int_type,
                             lldb::eAccessPublic, 0);
@@ -400,7 +400,7 @@ CompilerType PlatformLinux::GetSiginfoType(const llvm::Triple &triple) {
   // union used to hold the signal data
   CompilerType union_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(union_type);
 
   ast->AddFieldToRecordType(
diff --git a/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp b/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp
index 393519d708d3668..ce81aab55706238 100644
--- a/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp
+++ b/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp
@@ -228,7 +228,7 @@ CompilerType PlatformNetBSD::GetSiginfoType(const llvm::Triple &triple) {
 
   CompilerType sigval_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_sigval_t",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(sigval_type);
   ast->AddFieldToRecordType(sigval_type, "sival_int", int_type,
                             lldb::eAccessPublic, 0);
@@ -238,7 +238,7 @@ CompilerType PlatformNetBSD::GetSiginfoType(const llvm::Triple &triple) {
 
   CompilerType ptrace_option_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(ptrace_option_type);
   ast->AddFieldToRecordType(ptrace_option_type, "_pe_other_pid", pid_type,
                             lldb::eAccessPublic, 0);
@@ -249,13 +249,13 @@ CompilerType PlatformNetBSD::GetSiginfoType(const llvm::Triple &triple) {
   // siginfo_t
   CompilerType siginfo_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_siginfo_t",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(siginfo_type);
 
   // struct _ksiginfo
   CompilerType ksiginfo_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "",
-      clang::TTK_Struct, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Struct), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(ksiginfo_type);
   ast->AddFieldToRecordType(ksiginfo_type, "_signo", int_type,
                             lldb::eAccessPublic, 0);
@@ -272,7 +272,7 @@ CompilerType PlatformNetBSD::GetSiginfoType(const llvm::Triple &triple) {
   // union used to hold the signal data
   CompilerType union_type = ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "",
-      clang::TTK_Union, lldb::eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC);
   ast->StartTagDeclarationDefinition(union_type);
 
   ast->AddFieldToRecordType(
diff --git a/lldb/source/Plugins/RegisterTypeBuilder/RegisterTypeBuilderClang.cpp b/lldb/source/Plugins/RegisterTypeBuilder/RegisterTypeBuilderClang.cpp
index aeb54ef9ee24b10..067768537c06848 100644
--- a/lldb/source/Plugins/RegisterTypeBuilder/RegisterTypeBuilderClang.cpp
+++ b/lldb/source/Plugins/RegisterTypeBuilder/RegisterTypeBuilderClang.cpp
@@ -60,7 +60,8 @@ CompilerType RegisterTypeBuilderClang::GetRegisterType(
 
     fields_type = type_system->CreateRecordType(
         nullptr, OptionalClangModuleID(), lldb::eAccessPublic,
-        register_type_name, clang::TTK_Struct, lldb::eLanguageTypeC);
+        register_type_name, llvm::to_underlying(clang::TagTypeKind::Struct),
+        lldb::eLanguageTypeC);
     type_system->StartTagDeclarationDefinition(fields_type);
 
     // We assume that RegisterFlags has padded and sorted the fields
diff --git a/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp b/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
index 62353be170cfb0c..7a2b4c00eedf374 100644
--- a/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
+++ b/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
@@ -320,12 +320,12 @@ static uint32_t GetBytes(uint32_t bits) { return bits / sizeof(unsigned); }
 static clang::TagTypeKind TranslateRecordKind(CTFType::Kind type) {
   switch (type) {
   case CTFType::Kind::eStruct:
-    return clang::TTK_Struct;
+    return clang::TagTypeKind::Struct;
   case CTFType::Kind::eUnion:
-    return clang::TTK_Union;
+    return clang::TagTypeKind::Union;
   default:
     lldbassert(false && "Invalid record kind!");
-    return clang::TTK_Struct;
+    return clang::TagTypeKind::Struct;
   }
 }
 
@@ -503,9 +503,9 @@ SymbolFileCTF::CreateFunction(const CTFFunction &ctf_function) {
 llvm::Expected<lldb::TypeSP>
 SymbolFileCTF::CreateRecord(const CTFRecord &ctf_record) {
   const clang::TagTypeKind tag_kind = TranslateRecordKind(ctf_record.kind);
-  CompilerType record_type =
-      m_ast->CreateRecordType(nullptr, OptionalClangModuleID(), eAccessPublic,
-                              ctf_record.name.data(), tag_kind, eLanguageTypeC);
+  CompilerType record_type = m_ast->CreateRecordType(
+      nullptr, OptionalClangModuleID(), eAccessPublic, ctf_record.name.data(),
+      llvm::to_underlying(tag_kind), eLanguageTypeC);
   m_compiler_types[record_type.GetOpaqueQualType()] = &ctf_record;
   Declaration decl;
   return MakeType(ctf_record.uid, ConstString(ctf_record.name), ctf_record.size,
@@ -562,7 +562,7 @@ llvm::Expected<lldb::TypeSP>
 SymbolFileCTF::CreateForward(const CTFForward &ctf_forward) {
   CompilerType forward_compiler_type = m_ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), eAccessPublic, ctf_forward.name,
-      clang::TTK_Struct, eLanguageTypeC);
+      llvm::to_underlying(clang::TagTypeKind::Struct), eLanguageTypeC);
   Declaration decl;
   return MakeType(ctf_forward.uid, ConstString(ctf_forward.name), 0, nullptr,
                   LLDB_INVALID_UID, Type::eEncodingIsUID, decl,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 696708d3fc7cf5d..3174c18c97d888c 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -1637,13 +1637,13 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
   int tag_decl_kind = -1;
   AccessType default_accessibility = eAccessNone;
   if (tag == DW_TAG_structure_type) {
-    tag_decl_kind = clang::TTK_Struct;
+    tag_decl_kind = llvm::to_underlying(clang::TagTypeKind::Struct);
     default_accessibility = eAccessPublic;
   } else if (tag == DW_TAG_union_type) {
-    tag_decl_kind = clang::TTK_Union;
+    tag_decl_kind = llvm::to_underlying(clang::TagTypeKind::Union);
     default_accessibility = eAccessPublic;
   } else if (tag == DW_TAG_class_type) {
-    tag_decl_kind = clang::TTK_Class;
+    tag_decl_kind = llvm::to_underlying(clang::TagTypeKind::Class);
     default_accessibility = eAccessPrivate;
   }
 
@@ -3852,7 +3852,7 @@ void DWARFASTParserClang::ParseRustVariantPart(
       decl_context, OptionalClangModuleID(), lldb::eAccessPublic,
       std::string(
           llvm::formatv("{0}$Inner", class_clang_type.GetTypeName(false))),
-      clang::TTK_Union, lldb::eLanguageTypeRust);
+      llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeRust);
   m_ast.StartTagDeclarationDefinition(inner_holder);
   m_ast.SetIsPacked(inner_holder);
 
@@ -3866,7 +3866,8 @@ void DWARFASTParserClang::ParseRustVariantPart(
         m_ast.GetDeclContextForType(inner_holder), OptionalClangModuleID(),
         lldb::eAccessPublic,
         std::string(llvm::formatv("{0}$Variant", member.GetName())),
-        clang::TTK_Struct, lldb::eLanguageTypeRust);
+        llvm::to_underlying(clang::TagTypeKind::Struct),
+        lldb::eLanguageTypeRust);
 
     m_ast.StartTagDeclarationDefinition(field_type);
     auto offset = member.byte_offset;
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp
index c7ff25e904abeb9..5b690ead1e8dea7 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp
@@ -99,18 +99,18 @@ struct CreateMethodDecl : public TypeVisitorCallbacks {
 static clang::TagTypeKind TranslateUdtKind(const TagRecord &cr) {
   switch (cr.Kind) {
   case TypeRecordKind::Class:
-    return clang::TTK_Class;
+    return clang::TagTypeKind::Class;
   case TypeRecordKind::Struct:
-    return clang::TTK_Struct;
+    return clang::TagTypeKind::Struct;
   case TypeRecordKind::Union:
-    return clang::TTK_Union;
+    return clang::TagTypeKind::Union;
   case TypeRecordKind::Interface:
-    return clang::TTK_Interface;
+    return clang::TagTypeKind::Interface;
   case TypeRecordKind::Enum:
-    return clang::TTK_Enum;
+    return clang::TagTypeKind::Enum;
   default:
     lldbassert(false && "Invalid tag record kind!");
-    return clang::TTK_Struct;
+    return clang::TagTypeKind::Struct;
   }
 }
 
@@ -608,16 +608,17 @@ clang::QualType PdbAstBuilder::CreateRecordType(PdbTypeSymId id,
     return {};
 
   clang::TagTypeKind ttk = TranslateUdtKind(record);
-  lldb::AccessType access =
-      (ttk == clang::TTK_Class) ? lldb::eAccessPrivate : lldb::eAccessPublic;
+  lldb::AccessType access = (ttk == clang::TagTypeKind::Class)
+                                ? lldb::eAccessPrivate
+                                : lldb::eAccessPublic;
 
   ClangASTMetadata metadata;
   metadata.SetUserID(toOpaqueUid(id));
   metadata.SetIsDynamicCXXType(false);
 
-  CompilerType ct =
-      m_clang.CreateRecordType(context, OptionalClangModuleID(), access, uname,
-                               ttk, lldb::eLanguageTypeC_plus_plus, &metadata);
+  CompilerType ct = m_clang.CreateRecordType(
+      context, OptionalClangModuleID(), access, uname, llvm::to_underlying(ttk),
+      lldb::eLanguageTypeC_plus_plus, &metadata);
 
   lldbassert(ct.IsValid());
 
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
index cf3868d077c3f03..fab3ca989c0ec6b 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
@@ -356,14 +356,14 @@ UdtRecordCompleter::AddMember(TypeSystemClang &clang, Member *field,
   case Member::Struct:
   case Member::Union: {
     clang::TagTypeKind kind = field->kind == Member::Struct
-                                  ? clang::TagTypeKind::TTK_Struct
-                                  : clang::TagTypeKind::TTK_Union;
+                                  ? clang::TagTypeKind::Struct
+                                  : clang::TagTypeKind::Union;
     ClangASTMetadata metadata;
     metadata.SetUserID(pdb->anonymous_id);
     metadata.SetIsDynamicCXXType(false);
     CompilerType record_ct = clang.CreateRecordType(
-        parent_decl_ctx, OptionalClangModuleID(), lldb::eAccessPublic, "", kind,
-        lldb::eLanguageTypeC_plus_plus, &metadata);
+        parent_decl_ctx, OptionalClangModuleID(), lldb::eAccessPublic, "",
+        llvm::to_underlying(kind), lldb::eLanguageTypeC_plus_plus, &metadata);
     TypeSystemClang::StartTagDeclarationDefinition(record_ct);
     ClangASTImporter::LayoutInfo layout;
     clang::DeclContext *decl_ctx = clang.GetDeclContextForType(record_ct);
diff --git a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp
index 5efa5bccb85f51a..e915bff9e4a4790 100644
--- a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp
+++ b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp
@@ -49,13 +49,13 @@ using namespace llvm::pdb;
 static int TranslateUdtKind(PDB_UdtType pdb_kind) {
   switch (pdb_kind) {
   case PDB_UdtType::Class:
-    return clang::TTK_Class;
+    return llvm::to_underlying(clang::TagTypeKind::Class);
   case PDB_UdtType::Struct:
-    return clang::TTK_Struct;
+    return llvm::to_underlying(clang::TagTypeKind::Struct);
   case PDB_UdtType::Union:
-    return clang::TTK_Union;
+    return llvm::to_underlying(clang::TagTypeKind::Union);
   case PDB_UdtType::Interface:
-    return clang::TTK_Interface;
+    return llvm::to_underlying(clang::TagTypeKind::Interface);
   }
   llvm_unreachable("unsuported PDB UDT type");
 }
@@ -1387,9 +1387,9 @@ void PDBASTParser::AddRecordBases(
     auto is_virtual = base->isVirtualBaseClass();
 
     std::unique_ptr<clang::CXXBaseSpecifier> base_spec =
-        m_ast.CreateBaseClassSpecifier(base_comp_type.GetOpaqueQualType(),
-                                       access, is_virtual,
-                                       record_kind == clang::TTK_Class);
+        m_ast.CreateBaseClassSpecifier(
+            base_comp_type.GetOpaqueQualType(), access, is_virtual,
+            record_kind == llvm::to_underlying(clang::TagTypeKind::Class));
     lldbassert(base_spec);
 
     base_classes.push_back(std::move(base_spec));
diff --git a/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp b/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp
index f4ddc9e869148b3..dbceaf0ed4f4412 100644
--- a/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp
+++ b/lldb/source/Plugins/SystemRuntime/MacOSX/SystemRuntimeMacOSX.cpp
@@ -420,7 +420,8 @@ void SystemRuntimeMacOSX::ReadLibdispatchTSDIndexes() {
           scratch_ts_sp->GetBuiltinTypeForEncodingAndBitSize(eEncodingUint, 16);
       CompilerType dispatch_tsd_indexes_s = scratch_ts_sp->CreateRecordType(
           nullptr, OptionalClangModuleID(), lldb::eAccessPublic,
-          "__lldb_dispatch_tsd_indexes_s", clang::TTK_Struct,
+          "__lldb_dispatch_tsd_indexes_s",
+          llvm::to_underlying(clang::TagTypeKind::Struct),
           lldb::eLanguageTypeC);
 
       TypeSystemClang::StartTagDeclarationDefinition(dispatch_tsd_indexes_s);
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 5f64e0e4abaf9ff..6f65587c4acedd1 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -2320,8 +2320,9 @@ CompilerType TypeSystemClang::CreateStructForIdentifier(
     return type;
   }
 
-  type = CreateRecordType(nullptr, OptionalClangModuleID(), lldb::eAccessPublic,
-                          type_name, clang::TTK_Struct, lldb::eLanguageTypeC);
+  type = CreateRecordType(
+      nullptr, OptionalClangModuleID(), lldb::eAccessPublic, type_name,
+      llvm::to_underlying(clang::TagTypeKind::Struct), lldb::eLanguageTypeC);
   StartTagDeclarationDefinition(type);
   for (const auto &field : type_fields)
     AddFieldToRecordType(type, field.first, field.second, lldb::eAccessPublic,
diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
index ec5cc776d960fbe..c83e6ed1d418922 100644
--- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp
+++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
@@ -295,7 +295,8 @@ TEST_F(TestTypeSystemClang, TestOwningModule) {
 
   CompilerType record_type = ast.CreateRecordType(
       nullptr, OptionalClangModuleID(200), lldb::eAccessPublic, "FooRecord",
-      clang::TTK_Struct, lldb::eLanguageTypeC_plus_plus, nullptr);
+      llvm::to_underlying(clang::TagTypeKind::Struct),
+      lldb::eLanguageTypeC_plus_plus, nullptr);
   auto *rd = TypeSystemClang::GetAsRecordDecl(record_type);
   EXPECT_FALSE(!rd);
   EXPECT_EQ(rd->getOwningModuleID(), 200u);
@@ -315,7 +316,8 @@ TEST_F(TestTypeSystemClang, TestIsClangType) {
   CompilerType bool_type(m_ast->weak_from_this(), bool_ctype);
   CompilerType record_type = m_ast->CreateRecordType(
       nullptr, OptionalClangModuleID(100), lldb::eAccessPublic, "FooRecord",
-      clang::TTK_Struct, lldb::eLanguageTypeC_plus_plus, nullptr);
+      llvm::to_underlying(clang::TagTypeKind::Struct),
+      lldb::eLanguageTypeC_plus_plus, nullptr);
   // Clang builtin type and record type should pass
   EXPECT_TRUE(ClangUtil::IsClangType(bool_type));
   EXPECT_TRUE(ClangUtil::IsClangType(record_type));
@@ -327,7 +329,8 @@ TEST_F(TestTypeSystemClang, TestIsClangType) {
 TEST_F(TestTypeSystemClang, TestRemoveFastQualifiers) {
   CompilerType record_type = m_ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "FooRecord",
-      clang::TTK_Struct, lldb::eLanguageTypeC_plus_plus, nullptr);
+      llvm::to_underlying(clang::TagTypeKind::Struct),
+      lldb::eLanguageTypeC_plus_plus, nullptr);
   QualType qt;
 
   qt = ClangUtil::GetQualType(record_type);
@@ -399,7 +402,8 @@ TEST_F(TestTypeSystemClang, TestRecordHasFields) {
   // Test that a record with no fields returns false
   CompilerType empty_base = m_ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "EmptyBase",
-      clang::TTK_Struct, lldb::eLanguageTypeC_plus_plus, nullptr);
+      llvm::to_underlying(clang::TagTypeKind::Struct),
+      lldb::eLanguageTypeC_plus_plus, nullptr);
   TypeSystemClang::StartTagDeclarationDefinition(empty_base);
   TypeSystemClang::CompleteTagDeclarationDefinition(empty_base);
 
@@ -410,7 +414,8 @@ TEST_F(TestTypeSystemClang, TestRecordHasFields) {
   // Test that a record with direct fields returns true
   CompilerType non_empty_base = m_ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "NonEmptyBase",
-      clang::TTK_Struct, lldb::eLanguageTypeC_plus_plus, nullptr);
+      llvm::to_underlying(clang::TagTypeKind::Struct),
+      lldb::eLanguageTypeC_plus_plus, nullptr);
   TypeSystemClang::StartTagDeclarationDefinition(non_empty_base);
   FieldDecl *non_empty_base_field_decl = m_ast->AddFieldToRecordType(
       non_empty_base, "MyField", int_type, eAccessPublic, 0);
@@ -426,7 +431,8 @@ TEST_F(TestTypeSystemClang, TestRecordHasFields) {
   // Test that a record with no direct fields, but fields in a base returns true
   CompilerType empty_derived = m_ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "EmptyDerived",
-      clang::TTK_Struct, lldb::eLanguageTypeC_plus_plus, nullptr);
+      llvm::to_underlying(clang::TagTypeKind::Struct),
+      lldb::eLanguageTypeC_plus_plus, nullptr);
   TypeSystemClang::StartTagDeclarationDefinition(empty_derived);
   std::unique_ptr<clang::CXXBaseSpecifier> non_empty_base_spec =
       m_ast->CreateBaseClassSpecifier(non_empty_base.GetOpaqueQualType(),
@@ -448,7 +454,8 @@ TEST_F(TestTypeSystemClang, TestRecordHasFields) {
   // returns true
   CompilerType empty_derived2 = m_ast->CreateRecordType(
       nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "EmptyDerived2",
-      clang::TTK_Struct, lldb::eLanguageTypeC_plus_plus, nullptr);
+      llvm::to_underlying(clang::TagTypeKind::Struct),
+      lldb::eLanguageTypeC_plus_plus, nullptr);
   TypeSystemClang::StartTagDeclarationDefinition(empty_derived2);
   std::unique_ptr<CXXBaseSpecifier> non_empty_vbase_spec =
       m_ast->CreateBaseClassSpecifier(non_empty_base.GetOpaqueQualType(),
@@ -479,14 +486,14 @@ TEST_F(TestTypeSystemClang, TemplateArguments) {
   // template<typename T, int I> struct foo;
   ClassTemplateDecl *decl = m_ast->CreateClassTemplateDecl(
       m_ast->GetTranslationUnitDecl(), OptionalClangModuleID(), eAccessPublic,
-      "foo", TTK_Struct, infos);
+      "foo", llvm::to_underlying(clang::TagTypeKind::Struct), infos);
   ASSERT_NE(decl, nullptr);
 
   // foo<int, 47>
   ClassTemplateSpecializationDecl *spec_decl =
       m_ast->CreateClassTemplateSpecializationDecl(
           m_ast->GetTranslationUnitDecl(), OptionalClangModuleID(), decl,
-          TTK_Struct, infos);
+          llvm::to_underlying(clang::TagTypeKind::Struct), infos);
   ASSERT_NE(spec_decl, nullptr);
   CompilerType type = m_ast->CreateClassTemplateSpecializationType(spec_decl);
   ASSERT_TRUE(type);
@@ -543,7 +550,7 @@ class TestCreateClassTemplateDecl : public TestTypeSystemClang {
   CreateClassTemplate(const TypeSystemClang::TemplateParameterInfos &infos) {
     ClassTemplateDecl *decl = m_ast->CreateClassTemplateDecl(
         m_ast->GetTranslationUnitDecl(), OptionalClangModuleID(), eAccessPublic,
-        "foo", TTK_Struct, infos);
+        "foo", llvm::to_underlying(clang::TagTypeKind::Struct), infos);
     return decl;
   }
 

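The hunks above apply one mechanical pattern: the unscoped TTK_* enumerators
become scoped TagTypeKind values, and call sites that need the raw integer
(diagnostic arguments, bit packing, C APIs) now convert explicitly. A minimal,
self-contained sketch of that pattern, with a portable stand-in for
llvm::to_underlying (LLVM's pre-C++23 spelling of std::to_underlying):

    #include <type_traits>

    enum class TagTypeKind { Struct, Interface, Union, Class, Enum };

    // Stand-in for llvm::to_underlying: scoped enumerators no longer
    // convert implicitly, so the integral conversion is spelled out.
    template <typename Enum>
    constexpr std::underlying_type_t<Enum> to_underlying(Enum E) {
      return static_cast<std::underlying_type_t<Enum>>(E);
    }

    unsigned packTagKind(TagTypeKind Kind) {
      return to_underlying(Kind); // was: an implicit conversion from TTK_*
    }
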
>From 7c93452e174dd182c36471bc1e8272f26c0ae6db Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 3 Nov 2023 10:50:03 -0700
Subject: [PATCH 55/76] [indvars] Restructure getExtendedOperandRecurrence
 [nfc]

As suggested during review of https://github.com/llvm/llvm-project/pull/70990.
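
In outline, the function now classifies the extension kind first and then
materializes the extended SCEV operand exactly once, rather than building it
along three separate paths. A distilled sketch of the new control flow, with
hypothetical names standing in for the SCEV plumbing:

    enum class ExtendKind { Sign, Zero, Unknown };

    // Hypothetical stand-in for the wrap flags on the narrow operation.
    struct WrapFlags { bool NSW, NUW; };

    // Classify once; the caller applies the matching extension once and
    // bails out on Unknown.
    ExtendKind classifyExtend(ExtendKind Cur, WrapFlags F, bool NeverNegative) {
      if ((Cur == ExtendKind::Sign && F.NSW) ||
          (Cur == ExtendKind::Zero && F.NUW))
        return Cur;                  // the current kind is already legal
      if (NeverNegative && F.NSW)
        return ExtendKind::Sign;     // non-negative: either kind is usable
      if (NeverNegative && F.NUW)
        return ExtendKind::Zero;
      return ExtendKind::Unknown;
    }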
---
 llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 39 ++++++++++----------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 9b91d74b1d2fac5..a618d72b406b397 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1383,32 +1383,33 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
       DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
   assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
 
-  const SCEV *ExtendOperExpr = nullptr;
   const OverflowingBinaryOperator *OBO =
     cast<OverflowingBinaryOperator>(DU.NarrowUse);
   ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
-  if (ExtKind == ExtendKind::Sign && OBO->hasNoSignedWrap())
-    ExtendOperExpr = SE->getSignExtendExpr(
-      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
-  else if (ExtKind == ExtendKind::Zero && OBO->hasNoUnsignedWrap())
-    ExtendOperExpr = SE->getZeroExtendExpr(
-      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
-  else if (DU.NeverNegative) {
+  if (!(ExtKind == ExtendKind::Sign && OBO->hasNoSignedWrap()) &&
+      !(ExtKind == ExtendKind::Zero && OBO->hasNoUnsignedWrap())) {
+    ExtKind = ExtendKind::Unknown;
+
     // For a non-negative NarrowDef, we can choose either type of
     // extension.  We want to use the current extend kind if legal
     // (see above), and we only hit this code if we need to check
     // the opposite case.
-    if (OBO->hasNoSignedWrap()) {
-      ExtKind = ExtendKind::Sign;
-      ExtendOperExpr = SE->getSignExtendExpr(
-        SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
-    } else if (OBO->hasNoUnsignedWrap()) {
-      ExtKind = ExtendKind::Zero;
-      ExtendOperExpr = SE->getZeroExtendExpr(
-        SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
-    } else
-      return {nullptr, ExtendKind::Unknown};
-  } else
+    if (DU.NeverNegative) {
+      if (OBO->hasNoSignedWrap()) {
+        ExtKind = ExtendKind::Sign;
+      } else if (OBO->hasNoUnsignedWrap()) {
+        ExtKind = ExtendKind::Zero;
+      }
+    }
+  }
+
+  const SCEV *ExtendOperExpr =
+      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx));
+  if (ExtKind == ExtendKind::Sign)
+    ExtendOperExpr = SE->getSignExtendExpr(ExtendOperExpr, WideType);
+  else if (ExtKind == ExtendKind::Zero)
+    ExtendOperExpr = SE->getZeroExtendExpr(ExtendOperExpr, WideType);
+  else
     return {nullptr, ExtendKind::Unknown};
 
   // When creating this SCEV expr, don't apply the current operations NSW or NUW

>From 141122ece3c09a2f2e3c0280687633820bf632d5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 3 Nov 2023 17:52:51 +0000
Subject: [PATCH 56/76] [TableGen] Use StringRef::starts_with/ends_with instead
 of startswith/endswith. NFC.

startswith/endswith are thin wrappers around starts_with/ends_with and will eventually be removed; the new names match std::string_view more closely.
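
The rename is mechanical; a minimal sketch of an affected call site (the
string literal is illustrative):

    #include "llvm/ADT/StringRef.h"

    static bool isIntIntrinsicName(llvm::StringRef DefName) {
      return DefName.starts_with("int_"); // was: DefName.startswith("int_")
    }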
---
 clang/utils/TableGen/ClangAttrEmitter.cpp            |  6 +++---
 clang/utils/TableGen/NeonEmitter.cpp                 | 12 ++++++------
 llvm/utils/TableGen/AsmMatcherEmitter.cpp            |  6 +++---
 llvm/utils/TableGen/CallingConvEmitter.cpp           | 12 ++++++------
 llvm/utils/TableGen/CodeGenSchedule.cpp              |  4 ++--
 llvm/utils/TableGen/DXILEmitter.cpp                  |  2 +-
 llvm/utils/TableGen/GlobalISel/CodeExpander.cpp      |  8 ++++----
 .../TableGen/GlobalISelMatchTableExecutorEmitter.h   |  2 +-
 llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp     |  4 ++--
 llvm/utils/TableGen/X86FoldTablesEmitter.cpp         |  4 ++--
 10 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 4231bcbb362539d..4ec00573e8a9da8 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -161,7 +161,7 @@ static StringRef NormalizeNameForSpellingComparison(StringRef Name) {
 // Normalize the spelling of a GNU attribute (i.e. "x" in "__attribute__((x))"),
 // removing "__" if it appears at the beginning and end of the attribute's name.
 static StringRef NormalizeGNUAttrSpelling(StringRef AttrSpelling) {
-  if (AttrSpelling.startswith("__") && AttrSpelling.endswith("__")) {
+  if (AttrSpelling.starts_with("__") && AttrSpelling.ends_with("__")) {
     AttrSpelling = AttrSpelling.substr(2, AttrSpelling.size() - 4);
   }
 
@@ -356,7 +356,7 @@ namespace {
     }
 
     void writeDump(raw_ostream &OS) const override {
-      if (StringRef(type).endswith("Decl *")) {
+      if (StringRef(type).ends_with("Decl *")) {
         OS << "    OS << \" \";\n";
         OS << "    dumpBareDeclRef(SA->get" << getUpperName() << "());\n";
       } else if (type == "IdentifierInfo *") {
@@ -4537,7 +4537,7 @@ void EmitClangAttrParsedAttrImpl(RecordKeeper &Records, raw_ostream &OS) {
         continue;
       ArgNames.push_back(Arg->getValueAsString("Name").str());
       for (const auto &Class : Arg->getSuperClasses()) {
-        if (Class.first->getName().startswith("Variadic")) {
+        if (Class.first->getName().starts_with("Variadic")) {
           ArgNames.back().append("...");
           break;
         }
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 3c891dbe9d7aa0f..4b112972a1ec981 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -736,17 +736,17 @@ Type Type::fromTypedefName(StringRef Name) {
     Name = Name.drop_front();
   }
 
-  if (Name.startswith("float")) {
+  if (Name.starts_with("float")) {
     T.Kind = Float;
     Name = Name.drop_front(5);
-  } else if (Name.startswith("poly")) {
+  } else if (Name.starts_with("poly")) {
     T.Kind = Poly;
     Name = Name.drop_front(4);
-  } else if (Name.startswith("bfloat")) {
+  } else if (Name.starts_with("bfloat")) {
     T.Kind = BFloat16;
     Name = Name.drop_front(6);
   } else {
-    assert(Name.startswith("int"));
+    assert(Name.starts_with("int"));
     Name = Name.drop_front(3);
   }
 
@@ -787,7 +787,7 @@ Type Type::fromTypedefName(StringRef Name) {
     Name = Name.drop_front(I);
   }
 
-  assert(Name.startswith("_t") && "Malformed typedef!");
+  assert(Name.starts_with("_t") && "Malformed typedef!");
   return T;
 }
 
@@ -1655,7 +1655,7 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){
   std::string S = "__builtin_shufflevector(" + Arg1.second + ", " + Arg2.second;
   for (auto &E : Elts) {
     StringRef Name = E->getName();
-    assert_with_loc(Name.startswith("sv"),
+    assert_with_loc(Name.starts_with("sv"),
                     "Incorrect element kind in shuffle mask!");
     S += ", " + Name.drop_front(2).str();
   }
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 6231f5530d35146..f774f0c1018b3e1 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -910,7 +910,7 @@ extractSingletonRegisterForAsmOperand(MatchableInfo::AsmOperand &Op,
     return;
   }
 
-  if (!Tok.startswith(RegisterPrefix))
+  if (!Tok.starts_with(RegisterPrefix))
     return;
 
   StringRef RegName = Tok.substr(RegisterPrefix.size());
@@ -1520,7 +1520,7 @@ void AsmMatcherInfo::buildInfo() {
 
       // If the tblgen -match-prefix option is specified (for tblgen hackers),
       // filter the set of instructions we consider.
-      if (!StringRef(CGI->TheDef->getName()).startswith(MatchPrefix))
+      if (!StringRef(CGI->TheDef->getName()).starts_with(MatchPrefix))
         continue;
 
       // Ignore "codegen only" instructions.
@@ -1555,7 +1555,7 @@ void AsmMatcherInfo::buildInfo() {
       // filter the set of instruction aliases we consider, based on the target
       // instruction.
       if (!StringRef(Alias->ResultInst->TheDef->getName())
-            .startswith( MatchPrefix))
+               .starts_with(MatchPrefix))
         continue;
 
       StringRef V = Alias->TheDef->getValueAsString("AsmVariantName");
diff --git a/llvm/utils/TableGen/CallingConvEmitter.cpp b/llvm/utils/TableGen/CallingConvEmitter.cpp
index de3810b2e227916..06670e84d857260 100644
--- a/llvm/utils/TableGen/CallingConvEmitter.cpp
+++ b/llvm/utils/TableGen/CallingConvEmitter.cpp
@@ -106,12 +106,12 @@ void CallingConvEmitter::EmitCallingConv(Record *CC, raw_ostream &O) {
   // Emit all of the actions, in order.
   for (unsigned i = 0, e = CCActions->size(); i != e; ++i) {
     Record *Action = CCActions->getElementAsRecord(i);
-    SwiftAction = llvm::any_of(Action->getSuperClasses(),
-                               [](const std::pair<Record *, SMRange> &Class) {
-                                 std::string Name =
-                                     Class.first->getNameInitAsString();
-                                 return StringRef(Name).startswith("CCIfSwift");
-                               });
+    SwiftAction =
+        llvm::any_of(Action->getSuperClasses(),
+                     [](const std::pair<Record *, SMRange> &Class) {
+                       std::string Name = Class.first->getNameInitAsString();
+                       return StringRef(Name).starts_with("CCIfSwift");
+                     });
 
     O << "\n";
     EmitAction(Action, 2, O);
diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp
index c753d9f9edd4319..c3c5e4f8eb2d8c3 100644
--- a/llvm/utils/TableGen/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/CodeGenSchedule.cpp
@@ -118,7 +118,7 @@ struct InstRegexOp : public SetTheory::Operator {
       // The generic opcodes are unsorted, handle them manually.
       for (auto *Inst : Generics) {
         StringRef InstName = Inst->TheDef->getName();
-        if (InstName.startswith(Prefix) &&
+        if (InstName.starts_with(Prefix) &&
             (!Regexpr || Regexpr->match(InstName.substr(Prefix.size())))) {
           Elts.insert(Inst->TheDef);
           NumMatches++;
@@ -134,7 +134,7 @@ struct InstRegexOp : public SetTheory::Operator {
         }
         bool operator()(StringRef LHS, const CodeGenInstruction *RHS) {
           return LHS < RHS->TheDef->getName() &&
-                 !RHS->TheDef->getName().startswith(LHS);
+                 !RHS->TheDef->getName().starts_with(LHS);
         }
       };
       auto Range1 =
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index b294c66007f8419..a199463961be419 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -81,7 +81,7 @@ struct DXILOperationData {
     if (R->getValue("llvm_intrinsic")) {
       auto *IntrinsicDef = R->getValueAsDef("llvm_intrinsic");
       auto DefName = IntrinsicDef->getName();
-      assert(DefName.startswith("int_") && "invalid intrinsic name");
+      assert(DefName.starts_with("int_") && "invalid intrinsic name");
       // Remove the int_ from intrinsic name.
       Intrinsic = DefName.substr(4);
     }
diff --git a/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp b/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp
index 42b4aabf2755e7f..20f98bef4887c74 100644
--- a/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp
+++ b/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp
@@ -31,24 +31,24 @@ void CodeExpander::emit(raw_ostream &OS) const {
     OS << Current.substr(0, Pos);
     Current = Current.substr(Pos);
 
-    if (Current.startswith("\n")) {
+    if (Current.starts_with("\n")) {
       OS << "\n" << Indent;
       Current = Current.drop_front(1);
       continue;
     }
 
-    if (Current.startswith("\\$") || Current.startswith("\\\\")) {
+    if (Current.starts_with("\\$") || Current.starts_with("\\\\")) {
       OS << Current[1];
       Current = Current.drop_front(2);
       continue;
     }
 
-    if (Current.startswith("\\")) {
+    if (Current.starts_with("\\")) {
       Current = Current.drop_front(1);
       continue;
     }
 
-    if (Current.startswith("${")) {
+    if (Current.starts_with("${")) {
       StringRef StartVar = Current;
       Current = Current.drop_front(2);
       StringRef Var;
diff --git a/llvm/utils/TableGen/GlobalISelMatchTableExecutorEmitter.h b/llvm/utils/TableGen/GlobalISelMatchTableExecutorEmitter.h
index 13193ff8cc9fefd..c30198f11195c68 100644
--- a/llvm/utils/TableGen/GlobalISelMatchTableExecutorEmitter.h
+++ b/llvm/utils/TableGen/GlobalISelMatchTableExecutorEmitter.h
@@ -110,7 +110,7 @@ class GlobalISelMatchTableExecutorEmitter {
         OS << "  case GICXXPred_" << TypeIdentifier << "_Predicate_"
            << GetPredEnumName(Pred) << ": {\n"
            << "    " << Code << "\n";
-        if (!StringRef(Code).ltrim().startswith("return")) {
+        if (!StringRef(Code).ltrim().starts_with("return")) {
           OS << "    llvm_unreachable(\"" << GetPredEnumName(Pred)
              << " should have returned\");\n";
         }
diff --git a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
index 4b71174604c4f11..9871cf62cc0ad68 100644
--- a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
@@ -170,7 +170,7 @@ void X86EVEX2VEXTablesEmitter::run(raw_ostream &OS) {
     // Currently we only do AVX related checks and assume each instruction
     // has one and only one AVX related predicates.
     for (unsigned i = 0, e = PredicatesRecords.size(); i != e; ++i)
-      if (PredicatesRecords[i]->getName().startswith("HasAVX"))
+      if (PredicatesRecords[i]->getName().starts_with("HasAVX"))
         return PredicatesRecords[i]->getValueAsString("CondString");
     llvm_unreachable(
         "Instruction with checkPredicate set must have one predicate!");
@@ -187,7 +187,7 @@ void X86EVEX2VEXTablesEmitter::run(raw_ostream &OS) {
     if (!Def->isSubClassOf("X86Inst"))
       continue;
     // _REV instruction should not appear before encoding optimization
-    if (Def->getName().endswith("_REV"))
+    if (Def->getName().ends_with("_REV"))
       continue;
     RecognizableInstrBase RI(*Inst);
 
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index 6144e8b214c9866..86e8de89bc7e539 100644
--- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -439,7 +439,7 @@ void X86FoldTablesEmitter::addEntryWithFlags(FoldTable &Table,
   // Check no-kz version's isMoveReg
   StringRef RegInstName = RegRec->getName();
   unsigned DropLen =
-      RegInstName.endswith("rkz") ? 2 : (RegInstName.endswith("rk") ? 1 : 0);
+      RegInstName.ends_with("rkz") ? 2 : (RegInstName.ends_with("rk") ? 1 : 0);
   Record *BaseDef =
       DropLen ? Records.getDef(RegInstName.drop_back(DropLen)) : nullptr;
   bool IsMoveReg =
@@ -598,7 +598,7 @@ void X86FoldTablesEmitter::run(raw_ostream &o) {
     if (Match != OpcRegInsts.end()) {
       const CodeGenInstruction *RegInst = *Match;
       StringRef RegInstName = RegInst->TheDef->getName();
-      if (RegInstName.endswith("_REV") || RegInstName.endswith("_alt")) {
+      if (RegInstName.ends_with("_REV") || RegInstName.ends_with("_alt")) {
         if (auto *RegAltRec = Records.getDef(RegInstName.drop_back(4))) {
           RegInst = &Target.getInstruction(RegAltRec);
         }

>From af599389f5a346491dd6acab633bc86eeb0fe444 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Fri, 3 Nov 2023 10:56:29 -0700
Subject: [PATCH 57/76] [ELF] Improve --no-allow-shlib-undefined tests

---
 lld/test/ELF/Inputs/allow-shlib-undefined.s |   5 +-
 lld/test/ELF/allow-shlib-undefined.s        | 121 ++++++++++++--------
 2 files changed, 76 insertions(+), 50 deletions(-)

diff --git a/lld/test/ELF/Inputs/allow-shlib-undefined.s b/lld/test/ELF/Inputs/allow-shlib-undefined.s
index 5e3a4617902781b..ec7e73d58c2ec34 100644
--- a/lld/test/ELF/Inputs/allow-shlib-undefined.s
+++ b/lld/test/ELF/Inputs/allow-shlib-undefined.s
@@ -1,3 +1,6 @@
 .globl _shared
+.weak x2
 _shared:
-  callq _unresolved@PLT
+  callq x1@PLT
+
+       callq x2@PLT
diff --git a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s
index 03f047b02d75d52..5cab2160b10e0d1 100644
--- a/lld/test/ELF/allow-shlib-undefined.s
+++ b/lld/test/ELF/allow-shlib-undefined.s
@@ -1,60 +1,83 @@
 # REQUIRES: x86
 
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux \
-# RUN:   %p/Inputs/allow-shlib-undefined.s -o %t1.o
-# RUN: ld.lld -shared %t1.o -o %t.so
-
-# RUN: ld.lld --allow-shlib-undefined %t.o %t.so -o /dev/null
-# RUN: not ld.lld --no-allow-shlib-undefined %t.o %t.so -o /dev/null 2>&1 | FileCheck %s
-# Executable defaults to --no-allow-shlib-undefined
-# RUN: not ld.lld %t.o %t.so -o /dev/null 2>&1 | FileCheck %s
-# RUN: ld.lld %t.o %t.so --noinhibit-exec -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN
-# RUN: ld.lld %t.o %t.so --warn-unresolved-symbols -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN
-# -shared defaults to --allow-shlib-undefined
-# RUN: ld.lld -shared %t.o %t.so -o /dev/null
-
-# RUN: echo | llvm-mc -filetype=obj -triple=x86_64-unknown-linux -o %tempty.o
-# RUN: ld.lld -shared %tempty.o -o %tempty.so
-# RUN: ld.lld -shared %t1.o %tempty.so -o %t2.so
-# RUN: ld.lld --no-allow-shlib-undefined %t.o %t2.so -o /dev/null
-
-# DSO with undefines:
-# should link with or without any of these options.
-# RUN: ld.lld -shared %t1.o -o /dev/null
-# RUN: ld.lld -shared --allow-shlib-undefined %t1.o -o /dev/null
-# RUN: ld.lld -shared --no-allow-shlib-undefined %t1.o -o /dev/null
-
-## Check that the error is reported if an unresolved symbol is first seen in a
-## regular object file.
-# RUN: echo 'callq _unresolved@PLT' | \
-# RUN:   llvm-mc -filetype=obj -triple=x86_64-unknown-linux - -o %tref.o
-# RUN: not ld.lld --gc-sections %t.o %tref.o %t.so -o /dev/null 2>&1 | FileCheck %s
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 main.s -o main.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 def.s -o def.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 def-hidden.s -o def-hidden.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 ref.s -o ref.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o && ld.lld -shared a.o -o a.so
+# RUN: cp a.so b.so
+# RUN: llvm-mc -filetype=obj -triple=x86_64 empty.s -o empty.o && ld.lld -shared empty.o -o empty.so
+
+# RUN: ld.lld --allow-shlib-undefined main.o a.so -o /dev/null
+# RUN: not ld.lld --no-allow-shlib-undefined main.o a.so -o /dev/null 2>&1 | FileCheck %s
+## Executable linking defaults to --no-allow-shlib-undefined.
+# RUN: not ld.lld main.o a.so -o /dev/null 2>&1 | FileCheck %s
+# RUN: ld.lld main.o a.so --noinhibit-exec -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN
+# RUN: ld.lld main.o a.so --warn-unresolved-symbols -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN
+## -shared linking defaults to --allow-shlib-undefined.
+# RUN: ld.lld -shared main.o a.so -o /dev/null
+
+## DSO with undefines should link with or without any of these options.
+# RUN: ld.lld -shared --allow-shlib-undefined a.o -o /dev/null
+# RUN: ld.lld -shared --no-allow-shlib-undefined a.o -o /dev/null
+
+## Perform checking even if an unresolved symbol is first seen in a regular object file.
+# RUN: not ld.lld --gc-sections main.o ref.o a.so -o /dev/null 2>&1 | FileCheck %s
 
 ## Check that the error is reported for each shared library where the symbol
 ## is referenced.
-# RUN: cp %t.so %t2.so
-# RUN: not ld.lld %t.o %t.so %t2.so -o /dev/null 2>&1 | \
-# RUN:   FileCheck %s --check-prefixes=CHECK,CHECK2
+# RUN: not ld.lld main.o a.so empty.so b.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK2
 
-## Test some cases where relocatable object files provide a hidden definition.
-# RUN: echo '.globl _unresolved; _unresolved:' | llvm-mc -filetype=obj -triple=x86_64 -o %tdef.o
-# RUN: echo '.globl _unresolved; .hidden _unresolved; _unresolved:' | llvm-mc -filetype=obj -triple=x86_64 -o %tdef-hidden.o
-# RUN: ld.lld %t.o %t.so %tdef-hidden.o -o /dev/null 2>&1 | count 0
+## Test some cases when a relocatable object file provides a non-exported definition.
+# RUN: ld.lld main.o a.so def-hidden.o --fatal-warnings -o /dev/null
+# RUN: ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined --fatal-warnings -o /dev/null
+# RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings -o /dev/null
+## Test a relocatable object file definition that is converted to STB_LOCAL.
+# RUN: ld.lld main.o a.so def-hidden.o --version-script=local.ver --fatal-warnings -o /dev/null
 
 ## The section containing the definition is discarded, and we report an error.
-# RUN: not ld.lld --gc-sections %t.o %t.so %tdef-hidden.o -o /dev/null 2>&1 | FileCheck %s
-## The definition %tdef.so is ignored.
-# RUN: ld.lld -shared -soname=tdef.so %tdef.o -o %tdef.so
-# RUN: not ld.lld --gc-sections %t.o %t.so %tdef.so %tdef-hidden.o -o /dev/null 2>&1 | FileCheck %s
+# RUN: not ld.lld --gc-sections main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s
+## The definition in def.so is ignored.
+# RUN: ld.lld -shared def.o -o def.so
+# RUN: not ld.lld --gc-sections main.o a.so def.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s
+
+# CHECK-NOT:   error:
+# CHECK:       error: undefined reference due to --no-allow-shlib-undefined: x1{{$}}
+# CHECK-NEXT:  >>> referenced by a.so{{$}}
+# CHECK-NOT:   {{.}}
 
+# CHECK2-NOT:  error:
+# CHECK2:      error: undefined reference due to --no-allow-shlib-undefined: x1
+# CHECK2-NEXT: >>> referenced by a.so
+# CHECK2:      error: undefined reference due to --no-allow-shlib-undefined: x1
+# CHECK2-NEXT: >>> referenced by b.so
+# CHECK2-NOT:  {{.}}
+
+# WARN:        warning: undefined reference due to --no-allow-shlib-undefined: x1
+# WARN-NEXT:   >>> referenced by a.so
+
+#--- main.s
 .globl _start
 _start:
-  callq _shared at PLT
-
-# CHECK:       error: undefined reference due to --no-allow-shlib-undefined: _unresolved
-# CHECK-NEXT:  >>> referenced by {{.*}}.so
-# CHECK2:      error: undefined reference due to --no-allow-shlib-undefined: _unresolved
-# CHECK2-NEXT: >>> referenced by {{.*}}2.so
-# WARN:        warning: undefined reference due to --no-allow-shlib-undefined: _unresolved
-# WARN-NEXT:   >>> referenced by {{.*}}.so
+  callq shared at PLT
+#--- ref.s
+  callq x1 at PLT
+#--- def.s
+.globl x1
+x1:
+#--- def-hidden.s
+.globl x1
+.hidden x1
+x1:
+
+#--- a.s
+.globl shared
+.weak x2
+shared:
+  callq x1 at PLT
+  movq x2 at GOTPCREL(%rip), %rax
+
+#--- empty.s
+#--- local.ver
+v1 { local: x1; };

>From 3fe69bab1149e20f87ac31f8b2557baa88fcdd14 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Fri, 3 Nov 2023 11:01:40 -0700
Subject: [PATCH 58/76] Improve llvm-config.h to record which target is
 configured or not (#71164)

These macros can help guard tests and other sections of code that rely
on detecting whether a particular target is available. This is common
in MLIR code generation for GPU targets, for example.
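
As a rough illustration, a guard could look like the following minimal
sketch (using NVPTX as an example target; the helper function is
hypothetical, and the initializer declarations only exist when the
target was configured):

  #include "llvm/Config/llvm-config.h"
  #include "llvm/Support/TargetSelect.h"

  void initGPUBackendIfAvailable() {
  #if LLVM_HAS_NVPTX_TARGET
    // Only compiled when NVPTX was in LLVM_TARGETS_TO_BUILD.
    LLVMInitializeNVPTXTargetInfo();
    LLVMInitializeNVPTXTarget();
  #endif
  }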
---
 llvm/CMakeLists.txt                          |  2 +
 llvm/include/llvm/Config/llvm-config.h.cmake | 75 ++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 292efa3316df748..7ff3acd48304de7 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -955,6 +955,8 @@ foreach(t ${LLVM_TARGETS_TO_BUILD})
     endif()
   else()
     set(LLVM_ENUM_TARGETS "${LLVM_ENUM_TARGETS}LLVM_TARGET(${t})\n")
+    string(TOUPPER ${t} T_UPPER)
+    set(LLVM_HAS_${T_UPPER}_TARGET 1)
   endif()
 
   file(GLOB asmp_file "${td}/*AsmPrinter.cpp")
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 17b2d47fb6c43a3..6605ea60df99e14 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -54,6 +54,81 @@
 /* LLVM name for the native target MCA init function, if available */
 #cmakedefine LLVM_NATIVE_TARGETMCA LLVMInitialize${LLVM_NATIVE_ARCH}TargetMCA
 
+/* Define if the AArch64 target is built in */
+#cmakedefine01 LLVM_HAS_AARCH64_TARGET
+
+/* Define if the AMDGPU target is built in */
+#cmakedefine01 LLVM_HAS_AMDGPU_TARGET
+
+/* Define if the ARC target is built in */
+#cmakedefine01 LLVM_HAS_ARC_TARGET
+
+/* Define if the ARM target is built in */
+#cmakedefine01 LLVM_HAS_ARM_TARGET
+
+/* Define if the AVR target is built in */
+#cmakedefine01 LLVM_HAS_AVR_TARGET
+
+/* Define if the BPF target is built in */
+#cmakedefine01 LLVM_HAS_BPF_TARGET
+
+/* Define if the CSKY target is built in */
+#cmakedefine01 LLVM_HAS_CSKY_TARGET
+
+/* Define if the DirectX target is built in */
+#cmakedefine01 LLVM_HAS_DIRECTX_TARGET
+
+/* Define if the Hexagon target is built in */
+#cmakedefine01 LLVM_HAS_HEXAGON_TARGET
+
+/* Define if the Lanai target is built in */
+#cmakedefine01 LLVM_HAS_LANAI_TARGET
+
+/* Define if the LoongArch target is built in */
+#cmakedefine01 LLVM_HAS_LOONGARCH_TARGET
+
+/* Define if the M68k target is built in */
+#cmakedefine01 LLVM_HAS_M68K_TARGET
+
+/* Define if the Mips target is built in */
+#cmakedefine01 LLVM_HAS_MIPS_TARGET
+
+/* Define if the MSP430 target is built in */
+#cmakedefine01 LLVM_HAS_MSP430_TARGET
+
+/* Define if the NVPTX target is built in */
+#cmakedefine01 LLVM_HAS_NVPTX_TARGET
+
+/* Define if the PowerPC target is built in */
+#cmakedefine01 LLVM_HAS_POWERPC_TARGET
+
+/* Define if the RISCV target is built in */
+#cmakedefine01 LLVM_HAS_RISCV_TARGET
+
+/* Define if the Sparc target is built in */
+#cmakedefine01 LLVM_HAS_SPARC_TARGET
+
+/* Define if the SPIRV target is built in */
+#cmakedefine01 LLVM_HAS_SPIRV_TARGET
+
+/* Define if the SystemZ target is built in */
+#cmakedefine01 LLVM_HAS_SYSTEMZ_TARGET
+
+/* Define if the VE target is built in */
+#cmakedefine01 LLVM_HAS_VE_TARGET
+
+/* Define if the WebAssembly target is built in */
+#cmakedefine01 LLVM_HAS_WEBASSEMBLY_TARGET
+
+/* Define if the X86 target is built in */
+#cmakedefine01 LLVM_HAS_X86_TARGET
+
+/* Define if the XCore target is built in */
+#cmakedefine01 LLVM_HAS_XCORE_TARGET
+
+/* Define if the Xtensa target is built in */
+#cmakedefine01 LLVM_HAS_XTENSA_TARGET
+
 /* Define if this is Unixish platform */
 #cmakedefine LLVM_ON_UNIX ${LLVM_ON_UNIX}
 

>From 2b76bdc33b555beb637c13b8bdbd8d3e935d9d56 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Fri, 3 Nov 2023 11:03:15 -0700
Subject: [PATCH 59/76] [clang-doc] Fix a warning

This patch fixes:

  clang-tools-extra/clang-doc/BitcodeReader.cpp:78:3: error: default
  label in switch which covers all enumeration values
  [-Werror,-Wcovered-switch-default]
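
For reference, a minimal standalone sketch of the pattern (with a
simplified enum, not the real TagTypeKind): when a switch covers every
enumerator, the fallback must be placed after the switch rather than in
a default label, otherwise -Wcovered-switch-default fires.

  enum class Kind { Struct, Class, Enum };

  const char *toString(Kind K) {
    switch (K) {
    case Kind::Struct: return "struct";
    case Kind::Class:  return "class";
    case Kind::Enum:   return "enum";
    }
    // Reached only for out-of-range values; keeping this outside the
    // switch avoids -Wcovered-switch-default.
    return "invalid";
  }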
---
 clang-tools-extra/clang-doc/BitcodeReader.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang-tools-extra/clang-doc/BitcodeReader.cpp b/clang-tools-extra/clang-doc/BitcodeReader.cpp
index 8c97186b299fc2e..bfb04e7407b3804 100644
--- a/clang-tools-extra/clang-doc/BitcodeReader.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeReader.cpp
@@ -75,10 +75,9 @@ llvm::Error decodeRecord(const Record &R, TagTypeKind &Field,
   case TagTypeKind::Enum:
     Field = static_cast<TagTypeKind>(R[0]);
     return llvm::Error::success();
-  default:
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "invalid value for TagTypeKind");
   }
+  return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                 "invalid value for TagTypeKind");
 }
 
 llvm::Error decodeRecord(const Record &R, std::optional<Location> &Field,

>From 49168b2512ef55e225e9b7cd0821daa5c8ae5a9b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Fri, 3 Nov 2023 11:05:09 -0700
Subject: [PATCH 60/76] [ELF] Enhance --no-allow-shlib-undefined to report
 non-exported definition (#70769)

For a DSO with all DT_NEEDED entries accounted for, if it contains an
undefined non-weak symbol that shares a name with a non-exported
definition (hidden visibility or localized by a version script), and
there is no DSO definition, we should also report an error. Because the
definition is not exported, it cannot resolve the DSO reference at
runtime.

GNU ld introduced this error-checking in [April
2003](https://sourceware.org/pipermail/binutils/2003-April/026568.html).
The feature is available for executable links but not for -shared, and
it is orthogonal to --no-allow-shlib-undefined. We make the feature part
of --no-allow-shlib-undefined and let it work with -shared when
--no-allow-shlib-undefined is specified.

A subset of this error-checking is covered by commit
1981b1b6b92f7579a30c9ed32dbdf3bc749c1b40 for --gc-sections discarded
sections. This patch covers non-discarded sections as well.

Internally, I have identified 2 bugs (which would fail with
LD_BIND_NOW=1) covered by commit
1981b1b6b92f7579a30c9ed32dbdf3bc749c1b40
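
As a minimal sketch of the situation this catches (file names mirror the
updated test; the C++ source is illustrative): a.so references x1, and
the main link provides x1 only as a hidden, i.e. non-exported,
definition, so the DSO reference can never bind to it at runtime.

  // def-hidden.cpp: linked into the executable, but not exported.
  extern "C" __attribute__((visibility("hidden"))) void x1() {}

With this patch, linking main.o a.so def-hidden.o fails with:
  error: non-exported symbol 'x1' in 'def-hidden.o' is referenced by DSO 'a.so'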
---
 lld/ELF/Writer.cpp                   | 10 ++++++++--
 lld/test/ELF/allow-shlib-undefined.s | 10 +++++++---
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 57e1aa06c6aa873..a84e4864ab0e5a5 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -2023,10 +2023,16 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
             });
         if (!allNeededIsKnown)
           continue;
-        for (Symbol *sym : file->requiredSymbols)
-          if (sym->isUndefined() && !sym->isWeak())
+        for (Symbol *sym : file->requiredSymbols) {
+          if (sym->isUndefined() && !sym->isWeak()) {
             diagnose("undefined reference due to --no-allow-shlib-undefined: " +
                      toString(*sym) + "\n>>> referenced by " + toString(file));
+          } else if (sym->isDefined() && sym->computeBinding() == STB_LOCAL) {
+            diagnose("non-exported symbol '" + toString(*sym) + "' in '" +
+                     toString(sym->file) + "' is referenced by DSO '" +
+                     toString(file) + "'");
+          }
+        }
       }
     }
   }
diff --git a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s
index 5cab2160b10e0d1..56b44e144661c2d 100644
--- a/lld/test/ELF/allow-shlib-undefined.s
+++ b/lld/test/ELF/allow-shlib-undefined.s
@@ -30,11 +30,11 @@
 # RUN: not ld.lld main.o a.so empty.so b.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK2
 
 ## Test some cases when a relocatable object file provides a non-exported definition.
-# RUN: ld.lld main.o a.so def-hidden.o --fatal-warnings -o /dev/null
-# RUN: ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined --fatal-warnings -o /dev/null
+# RUN: not ld.lld main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED
+# RUN: not ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED
 # RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings -o /dev/null
 ## Test a relocatable object file definition that is converted to STB_LOCAL.
-# RUN: ld.lld main.o a.so def-hidden.o --version-script=local.ver --fatal-warnings -o /dev/null
+# RUN: not ld.lld main.o a.so def-hidden.o --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED
 
 ## The section containing the definition is discarded, and we report an error.
 # RUN: not ld.lld --gc-sections main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s
@@ -57,6 +57,10 @@
 # WARN:        warning: undefined reference due to --no-allow-shlib-undefined: x1
 # WARN-NEXT:   >>> referenced by a.so
 
+# NONEXPORTED-NOT: error:
+# NONEXPORTED:     error: non-exported symbol 'x1' in 'def-hidden.o' is referenced by DSO 'a.so'
+# NONEXPORTED-NOT: {{.}}
+
 #--- main.s
 .globl _start
 _start:

>From 19b5495b653a00da7a250f48b4f739fcf2bbe82f Mon Sep 17 00:00:00 2001
From: Manman Ren <manman.ren at gmail.com>
Date: Fri, 3 Nov 2023 11:13:58 -0700
Subject: [PATCH 61/76] Port Swift's merge function pass to llvm: merging
 functions that differ in constants (#68235)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See RFC for details:
https://discourse.llvm.org/t/rfc-for-moving-swift-s-merge-function-pass-to-llvm/73778

This patch adds a new pass which is ported from Swift. We will need to
refactor the extensions to FunctionComparator/FunctionHash into
StructuralHash, and to discuss how to migrate Swift's pass over after we
land this in llvm.

This PR is created to get some early review on the patch.
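
As a conceptual sketch (the pass works on LLVM IR and only considers
constants in eligible instructions such as call operands; the C++ names
here are illustrative only):

  void store(int *p, int v);

  // Two functions that differ only in one constant operand:
  void f1(int *p) { store(p, 10); }
  void f2(int *p) { store(p, 20); }

  // Conceptual result: one shared body parameterized over the differing
  // constant, with the originals rewritten as thunks.
  static void fMerged(int *p, int c) { store(p, c); }
  void f1Thunk(int *p) { fMerged(p, 10); }
  void f2Thunk(int *p) { fMerged(p, 20); }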

---------

Co-authored-by: Manman Ren <mren at meta.com>
---
 .../IPO/MergeFunctionsIgnoringConst.h         |   42 +
 .../Transforms/Utils/FunctionComparator.h     |    1 +
 .../Utils/FunctionComparatorIgnoringConst.h   |   58 +
 .../Utils/MergeFunctionsIgnoringConst.h       |   29 +
 llvm/lib/Passes/PassBuilder.cpp               |    1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   11 +
 llvm/lib/Passes/PassRegistry.def              |    1 +
 llvm/lib/Transforms/IPO/CMakeLists.txt        |    1 +
 .../IPO/MergeFunctionsIgnoringConst.cpp       | 1399 +++++++++++++++++
 llvm/lib/Transforms/Utils/CMakeLists.txt      |    1 +
 .../Utils/FunctionComparatorIgnoringConst.cpp |  107 ++
 .../MergeFuncIgnoringConst/merge_func.ll      |  532 +++++++
 .../merge_with_exception.ll                   |  190 +++
 13 files changed, 2373 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h
 create mode 100644 llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h
 create mode 100644 llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h
 create mode 100644 llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp
 create mode 100644 llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp
 create mode 100644 llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll
 create mode 100644 llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll

diff --git a/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h b/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h
new file mode 100644
index 000000000000000..638d009abf2bffc
--- /dev/null
+++ b/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h
@@ -0,0 +1,42 @@
+//===- MergeFunctionsIgnoringConst.h - Merge Functions ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares MergeFuncIgnoringConstPass, a pass that merges
+// functions which differ only by constants in certain instructions,
+// replacing the differing constants with parameters and emitting thunks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H
+#define LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Module;
+
+/// Merge functions that differ by constants.
+class MergeFuncIgnoringConstPass
+    : public PassInfoMixin<MergeFuncIgnoringConstPass> {
+  bool PtrAuthEnabled = false;
+  unsigned PtrAuthKey = 0;
+  std::string MergeFuncSuffix = ".Tm";
+
+public:
+  MergeFuncIgnoringConstPass() {}
+  MergeFuncIgnoringConstPass(bool PtrAuthEnabled, unsigned PtrAuthKey,
+                             std::string Suffix)
+      : PtrAuthEnabled(PtrAuthEnabled), PtrAuthKey(PtrAuthKey),
+        MergeFuncSuffix(Suffix) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H
diff --git a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
index c28f868039a1f7b..1a314b481c72c61 100644
--- a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
+++ b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
@@ -379,6 +379,7 @@ class FunctionComparator {
   /// But, we are still not able to compare operands of PHI nodes, since those
   /// could be operands from further BBs we didn't scan yet.
   /// So it's impossible to use dominance properties in general.
+protected:
   mutable DenseMap<const Value*, int> sn_mapL, sn_mapR;
 
   // The global state we will use
diff --git a/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h b/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h
new file mode 100644
index 000000000000000..9c7fe3baf2fa0db
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h
@@ -0,0 +1,58 @@
+//===- FunctionComparatorIgnoringConst.h - Function Comparator --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the FunctionComparatorIgnoringConst class which is used by
+// the MergeFuncIgnoringConst pass for comparing functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H
+#define LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+#include <set>
+
+namespace llvm {
+
+/// FunctionComparatorIgnoringConst - Compares two functions to determine
+/// whether or not they match when certain constants are ignored.
+class FunctionComparatorIgnoringConst : public FunctionComparator {
+public:
+  FunctionComparatorIgnoringConst(const Function *F1, const Function *F2,
+                                  GlobalNumberState *GN)
+      : FunctionComparator(F1, F2, GN) {}
+
+  int cmpOperandsIgnoringConsts(const Instruction *L, const Instruction *R,
+                                unsigned opIdx);
+
+  int cmpBasicBlocksIgnoringConsts(
+      const BasicBlock *BBL, const BasicBlock *BBR,
+      const std::set<std::pair<int, int>> *InstOpndIndex = nullptr);
+
+  int compareIgnoringConsts(
+      const std::set<std::pair<int, int>> *InstOpndIndex = nullptr);
+
+  int compareConstants(const Constant *L, const Constant *R) const {
+    return cmpConstants(L, R);
+  }
+
+private:
+  /// Scratch index for instruction in order during cmpOperandsIgnoringConsts.
+  int Index = 0;
+};
+
+} // end namespace llvm
+#endif // LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H
diff --git a/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h b/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h
new file mode 100644
index 000000000000000..e63afbb6bbf1718
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h
@@ -0,0 +1,29 @@
+//===- MergeFunctionsIgnoringConst.h - Merge Functions ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines helpers used in the MergeFunctionsIgnoringConst.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H
+#define LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+
+using namespace llvm;
+
+bool isEligibleInstrunctionForConstantSharing(const Instruction *I);
+
+bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx);
+
+bool isEligibleFunction(Function *F);
+
+Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy);
+#endif // LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 0d7cac19d44c3a8..789ddfcbf529879 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -123,6 +123,7 @@
 #include "llvm/Transforms/IPO/LowerTypeTests.h"
 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
 #include "llvm/Transforms/IPO/MergeFunctions.h"
+#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h"
 #include "llvm/Transforms/IPO/ModuleInliner.h"
 #include "llvm/Transforms/IPO/OpenMPOpt.h"
 #include "llvm/Transforms/IPO/PartialInlining.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index baea2913338cda7..20dbd3952beb60f 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -60,6 +60,7 @@
 #include "llvm/Transforms/IPO/LowerTypeTests.h"
 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
 #include "llvm/Transforms/IPO/MergeFunctions.h"
+#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h"
 #include "llvm/Transforms/IPO/ModuleInliner.h"
 #include "llvm/Transforms/IPO/OpenMPOpt.h"
 #include "llvm/Transforms/IPO/PartialInlining.h"
@@ -176,6 +177,10 @@ static cl::opt<bool> EnableMergeFunctions(
     "enable-merge-functions", cl::init(false), cl::Hidden,
     cl::desc("Enable function merging as part of the optimization pipeline"));
 
+static cl::opt<bool> EnableMergeFuncIgnoringConst(
+    "enable-merge-func-ignoring-const", cl::init(false), cl::Hidden,
+    cl::desc("Enable function merger that ignores constants"));
+
 static cl::opt<bool> EnablePostPGOLoopRotation(
     "enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
     cl::desc("Run the loop rotation transformation after PGO instrumentation"));
@@ -1633,6 +1638,9 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
   MPM.addPass(buildModuleOptimizationPipeline(
       Level, ThinOrFullLTOPhase::ThinLTOPostLink));
 
+  if (EnableMergeFuncIgnoringConst)
+    MPM.addPass(MergeFuncIgnoringConstPass());
+
   // Emit annotation remarks.
   addAnnotationRemarksPass(MPM);
 
@@ -1958,6 +1966,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 
   invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
 
+  if (EnableMergeFuncIgnoringConst)
+    MPM.addPass(MergeFuncIgnoringConstPass());
+
   // Emit annotation remarks.
   addAnnotationRemarksPass(MPM);
 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index eb51ccef68c827d..ba32c64d18423b9 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -87,6 +87,7 @@ MODULE_PASS("lower-ifunc", LowerIFuncPass())
 MODULE_PASS("lowertypetests", LowerTypeTestsPass())
 MODULE_PASS("metarenamer", MetaRenamerPass())
 MODULE_PASS("mergefunc", MergeFunctionsPass())
+MODULE_PASS("mergefunc-ignoring-const", MergeFuncIgnoringConstPass())
 MODULE_PASS("name-anon-globals", NameAnonGlobalPass())
 MODULE_PASS("no-op-module", NoOpModulePass())
 MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass())
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 034f1587ae8df44..4dac04d3369950f 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -30,6 +30,7 @@ add_llvm_component_library(LLVMipo
   LowerTypeTests.cpp
   MemProfContextDisambiguation.cpp
   MergeFunctions.cpp
+  MergeFunctionsIgnoringConst.cpp
   ModuleInliner.cpp
   OpenMPOpt.cpp
   PartialInlining.cpp
diff --git a/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp b/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp
new file mode 100644
index 000000000000000..d6ae788ddb9e1a1
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp
@@ -0,0 +1,1399 @@
+//===--- MergeFunctionsIgnoringConst.cpp - Merge functions ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for similar functions that are mergeable and folds them.
+// The implementation is similar to LLVM's MergeFunctions pass. Instead of
+// merging identical functions, it merges functions which only differ by a few
+// constants in certain instructions.
+// This is copied from Swift's implementation.
+//
+// This pass should run after LLVM's MergeFunctions pass, because it works best
+// if there are no _identical_ functions in the module.
+// Note: it would also work for identical functions but could produce more
+// code overhead than the LLVM pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StableHashing.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/StructuralHash.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mergefunc-ignoring-const"
+
+STATISTIC(NumFunctionsMergedIgnoringConst, "Number of functions merged");
+STATISTIC(NumThunksWrittenIgnoringConst, "Number of thunks generated");
+
+static cl::opt<bool> EnableAggressiveMergeFunc(
+    "enable-aggressive-mergefunc-ignoringconst", cl::init(false), cl::Hidden,
+    cl::desc("Enable more aggressive function merger"));
+
+static cl::opt<unsigned> NumFunctionsIgnoringConstForSanityCheck(
+    "mergefunc-ignoringconst-sanity",
+    cl::desc("How many functions in module could be used for "
+             "MergeFunctionsIgnoringConst pass sanity check. "
+             "'0' disables this check. Works only with '-debug' key."),
+    cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> IgnoringConstMergeThreshold(
+    "mergefunc-ignoringconst-threshold",
+    cl::desc("Functions larger than the threshold are considered for merging."
+             "'0' disables function merging at all."),
+    cl::init(15), cl::Hidden);
+
+cl::opt<bool> UseLinkOnceODRLinkageMerging(
+    "use-linkonceodr-linkage-merging", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Use LinkeOnceODR linkage to deduplicate the identical merged function "
+        "(default = off)"));
+
+cl::opt<bool> NoInlineForMergedFunction(
+    "no-inline-merged-function", cl::init(false), cl::Hidden,
+    cl::desc("set noinline for merged function (default = off)"));
+
+static cl::opt<bool>
+    CastArrayType("merge-cast-array-type", cl::init(false), cl::Hidden,
+                  cl::desc("support for casting array type (default = off)"));
+
+static cl::opt<bool> IgnoreMusttailFunction(
+    "ignore-musttail-function", cl::init(false), cl::Hidden,
+    cl::desc(
+        "ignore functions containing callsites with musttail (default = off)"));
+
+static cl::opt<bool> AlwaysCallThunk(
+    "merge-always-call-thunk", cl::init(false), cl::Hidden,
+    cl::desc(
+        "do not replace callsites and always emit a thunk (default = off)"));
+
+static cl::list<std::string> MergeBlockRegexFilters(
+    "merge-block-regex", cl::Optional,
+    cl::desc("Block functions from merging if they match the given "
+             "regular expression"),
+    cl::ZeroOrMore);
+
+static cl::list<std::string> MergeAllowRegexFilters(
+    "merge-allow-regex", cl::Optional,
+    cl::desc("Allow functions from merging if they match the given "
+             "regular expression"),
+    cl::ZeroOrMore);
+
+bool isEligibleInstrunctionForConstantSharing(const Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::Store:
+  case Instruction::Call:
+    return true;
+  default: {
+    if (EnableAggressiveMergeFunc && I->getOpcode() == Instruction::Invoke)
+      return true;
+    return false;
+  }
+  }
+}
+
+/// Returns true if the \p OpIdx operand of \p CI is the callee operand.
+static bool isCalleeOperand(const CallBase *CI, unsigned OpIdx) {
+  return &CI->getCalledOperandUse() == &CI->getOperandUse(OpIdx);
+}
+
+static bool canParameterizeCallOperand(const CallBase *CI, unsigned OpIdx) {
+  if (CI->isInlineAsm())
+    return false;
+  Function *Callee = CI->getCalledOperand()
+                         ? dyn_cast_or_null<Function>(
+                               CI->getCalledOperand()->stripPointerCasts())
+                         : nullptr;
+  if (Callee) {
+    if (Callee->isIntrinsic())
+      return false;
+    // objc_msgSend stubs must be called, and can't have their address taken.
+    if (Callee->getName().startswith("objc_msgSend$"))
+      return false;
+  }
+  if (isCalleeOperand(CI, OpIdx) &&
+      CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value()) {
+    // The operand is the callee and it has already been signed. Ignore this
+    // because we cannot add another ptrauth bundle to the call instruction.
+    return false;
+  }
+  return true;
+}
+
+bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx) {
+  assert(OpIdx < I->getNumOperands() && "Invalid operand index");
+
+  if (!isEligibleInstrunctionForConstantSharing(I))
+    return false;
+
+  auto Opnd = I->getOperand(OpIdx);
+  if (!isa<Constant>(Opnd))
+    return false;
+
+  if (const auto *CI = dyn_cast<CallBase>(I))
+    return canParameterizeCallOperand(CI, OpIdx);
+
+  return true;
+}
+
+namespace {
+
+/// MergeFuncIgnoringConst finds functions which only differ by constants in
+/// certain instructions, e.g. resulting from specialized functions of layout
+/// compatible types.
+/// Such functions are merged by replacing the differing constants by a
+/// parameter. The original functions are replaced by thunks which call the
+/// merged function with the specific argument constants.
+///
+class MergeFuncIgnoringConstImpl {
+public:
+  MergeFuncIgnoringConstImpl(bool PtrAuthEnabled, unsigned PtrAuthKey,
+                             std::string Suffix)
+      : FnTree(FunctionNodeCmp(&GlobalNumbers)), PtrAuthEnabled(PtrAuthEnabled),
+        PtrAuthKey(PtrAuthKey), MergeFuncSuffix(Suffix) {}
+
+  bool runImpl(Module &M);
+
+private:
+  struct FunctionEntry;
+
+  /// Describes the set of functions which are considered as "equivalent" (i.e.
+  /// only differing by some constants).
+  struct EquivalenceClass {
+    /// The singly-linked list of all functions which are members of this
+    /// equivalence class.
+    FunctionEntry *First;
+
+    /// A very cheap hash, used to early exit if functions do not match.
+    llvm::IRHash Hash;
+
+  public:
+    // Note the hash is recalculated potentially multiple times, but it is
+    // cheap.
+    EquivalenceClass(FunctionEntry *First)
+        : First(First), Hash(StructuralHash(*First->F)) {
+      assert(!First->Next);
+    }
+  };
+
+  /// The function comparison operator is provided here so that equivalence
+  /// class entries do not need to become larger with another pointer.
+  class FunctionNodeCmp {
+    GlobalNumberState *GlobalNumbers;
+
+  public:
+    FunctionNodeCmp(GlobalNumberState *GN) : GlobalNumbers(GN) {}
+    bool operator()(const EquivalenceClass &LHS,
+                    const EquivalenceClass &RHS) const {
+      // Order first by hashes, then full function comparison.
+      if (LHS.Hash != RHS.Hash)
+        return LHS.Hash < RHS.Hash;
+      FunctionComparatorIgnoringConst FCmp(LHS.First->F, RHS.First->F,
+                                           GlobalNumbers);
+      return FCmp.compareIgnoringConsts() == -1;
+    }
+  };
+  using FnTreeType = std::set<EquivalenceClass, FunctionNodeCmp>;
+
+  /// Bookkeeping state for a single function considered for merging.
+  struct FunctionEntry {
+    FunctionEntry(Function *F, FnTreeType::iterator I)
+        : F(F), Next(nullptr), NumUnhandledCallees(0), TreeIter(I),
+          IsMerged(false) {}
+
+    /// Back-link to the function.
+    AssertingVH<Function> F;
+
+    /// The next function in its equivalence class.
+    FunctionEntry *Next;
+
+    /// The number of not-yet merged callees. Used to process the merging in
+    /// bottom-up call order.
+    /// This is only valid in the first entry of an equivalence class. The
+    /// counts of all functions in an equivalence class are accumulated in the
+    /// first entry.
+    int NumUnhandledCallees;
+
+    /// The iterator of the function's equivalence class in the FnTree.
+    /// It's FnTree.end() if the function is not in an equivalence class.
+    FnTreeType::iterator TreeIter;
+
+    /// True if this function is already a thunk, calling the merged function.
+    bool IsMerged;
+  };
+
+  /// Describes an operator of a specific instruction.
+  struct OpLocation {
+    Instruction *I;
+    unsigned OpIndex;
+  };
+
+  /// Information for a function. Used during merging.
+  struct FunctionInfo {
+
+    FunctionInfo(Function *F)
+        : F(F), CurrentInst(nullptr), NumParamsNeeded(0) {}
+
+    void init() {
+      CurrentInst = &*F->begin()->begin();
+      NumParamsNeeded = 0;
+    }
+
+    /// Advances the current instruction to the next instruction.
+    void nextInst() {
+      assert(CurrentInst);
+      if (CurrentInst->isTerminator()) {
+        auto BlockIter = std::next(CurrentInst->getParent()->getIterator());
+        if (BlockIter == F->end()) {
+          CurrentInst = nullptr;
+          return;
+        }
+        CurrentInst = &*BlockIter->begin();
+        return;
+      }
+      CurrentInst = &*std::next(CurrentInst->getIterator());
+    }
+
+    /// Returns true if the operand \p OpIdx of the current instruction is the
+    /// callee of a call, which needs to be signed if passed as a parameter.
+    bool needsPointerSigning(unsigned OpIdx) const {
+      if (auto *CI = dyn_cast<CallInst>(CurrentInst))
+        return isCalleeOperand(CI, OpIdx);
+      return false;
+    }
+
+    Function *F;
+
+    /// The current instruction while iterating over all instructions.
+    Instruction *CurrentInst;
+
+    /// Roughly the number of parameters needed if this function would be
+    /// merged with the first function of the equivalence class.
+    int NumParamsNeeded;
+  };
+
+  using FunctionInfos = SmallVector<FunctionInfo, 8>;
+
+  /// Describes a parameter which we create to parameterize the merged function.
+  struct ParamInfo {
+    /// The value of the parameter for all the functions in the equivalence
+    /// class.
+    SmallVector<Constant *, 8> Values;
+
+    /// All uses of the parameter in the merged function.
+    SmallVector<OpLocation, 16> Uses;
+
+    /// The Discriminator for pointer signing.
+    /// Only non-null if NeedsPointerSigning is true.
+    ConstantInt *Discriminator = nullptr;
+
+    /// True if the value is a callee function, which needs to be signed if
+    /// passed as a parameter.
+    bool NeedsPointerSigning = false;
+
+    /// Checks if this parameter can be used to describe an operand in all
+    /// functions of the equivalence class. Returns true if all values match
+    /// the specific instruction operands in all functions.
+    bool matches(const FunctionInfos &FInfos, unsigned OpIdx,
+                 bool PtrAuthEnabled) const {
+      unsigned NumFuncs = FInfos.size();
+      assert(Values.size() == NumFuncs);
+      if (PtrAuthEnabled &&
+          NeedsPointerSigning != FInfos[0].needsPointerSigning(OpIdx)) {
+        return false;
+      }
+      for (unsigned Idx = 0; Idx < NumFuncs; ++Idx) {
+        const FunctionInfo &FI = FInfos[Idx];
+        Constant *C = cast<Constant>(FI.CurrentInst->getOperand(OpIdx));
+        if (Values[Idx] != C)
+          return false;
+      }
+      return true;
+    }
+
+    /// Computes the Discriminator for pointer signing.
+    void computeDiscriminator(LLVMContext &Context) {
+      assert(NeedsPointerSigning);
+      assert(!Discriminator);
+
+      // Get a hash from the concatenated function names.
+      // The hash is deterministic, because the order of values depends on the
+      // order of functions in the module, which is itself deterministic.
+      // Note that the hash is not part of the ABI, because it's purely used
+      // for pointer authentication between a module-private caller-callee
+      // pair.
+      std::string concatenatedCalleeNames;
+      for (Constant *value : Values) {
+        if (auto *GO = dyn_cast<GlobalObject>(value))
+          concatenatedCalleeNames += GO->getName();
+      }
+      uint64_t rawHash = stable_hash_combine_string(concatenatedCalleeNames);
+      IntegerType *discrTy = Type::getInt64Ty(Context);
+      Discriminator = ConstantInt::get(discrTy, (rawHash % 0xFFFF) + 1);
+    }
+  };
+
+  using ParamInfos = SmallVector<ParamInfo, 16>;
+
+  Module *CurrentModule = nullptr;
+
+  GlobalNumberState GlobalNumbers;
+
+  /// A work queue of functions that may have been modified and should be
+  /// analyzed again.
+  std::vector<WeakTrackingVH> Deferred;
+
+  /// The set of all distinct functions. Use the insert() and remove() methods
+  /// to modify it. The set allows efficient lookup and deferring of Functions.
+  FnTreeType FnTree;
+
+  ValueMap<Function *, FunctionEntry *> FuncEntries;
+
+  // Maps a function-pointer / Discriminator pair to a corresponding global in
+  // the llvm.ptrauth section.
+  // This map is used as a cache to not create ptrauth globals twice.
+  DenseMap<std::pair<Constant *, ConstantInt *>, Constant *> PtrAuthGlobals;
+
+  /// True if the architecture has pointer authentication enabled.
+  bool PtrAuthEnabled = false;
+
+  /// The key for pointer authentication.
+  unsigned PtrAuthKey = 0;
+
+  std::string MergeFuncSuffix = ".Tm";
+
+  FunctionEntry *getEntry(Function *F) const { return FuncEntries.lookup(F); }
+
+  bool isInEquivalenceClass(FunctionEntry *FE) const {
+    if (FE->TreeIter != FnTree.end()) {
+      return true;
+    }
+    assert(!FE->Next);
+    assert(FE->NumUnhandledCallees == 0);
+    return false;
+  }
+
+  /// Checks the rules of the order relation introduced among the set of
+  /// functions. Returns true if the sanity check passed, and false otherwise.
+  bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
+
+  /// Updates the NumUnhandledCallees of all user functions of the equivalence
+  /// class containing \p FE by \p Delta.
+  void updateUnhandledCalleeCount(FunctionEntry *FE, int Delta);
+
+  bool tryMergeEquivalenceClass(FunctionEntry *FirstInClass);
+
+  FunctionInfo removeFuncWithMostParams(FunctionInfos &FInfos);
+
+  bool deriveParams(ParamInfos &Params, FunctionInfos &FInfos,
+                    unsigned maxParams);
+
+  bool numOperandsDiffer(FunctionInfos &FInfos);
+
+  bool constsDiffer(const FunctionInfos &FInfos, unsigned OpIdx);
+
+  bool tryMapToParameter(FunctionInfos &FInfos, unsigned OpIdx,
+                         ParamInfos &Params, unsigned maxParams);
+
+  void replaceCallWithAddedPtrAuth(CallInst *origCall, Value *newCallee,
+                                   ConstantInt *Discriminator);
+
+  void mergeWithParams(const FunctionInfos &FInfos, ParamInfos &Params);
+  static void dumpMergeInfo(const FunctionInfos &FInfos, unsigned);
+
+  void removeEquivalenceClassFromTree(FunctionEntry *FE);
+
+  void writeThunk(Function *ToFunc, Function *Thunk, const ParamInfos &Params,
+                  unsigned FuncIdx);
+
+  bool isPtrAuthEnabled() const {
+    // TODO: fix pointer authentication
+    return PtrAuthEnabled;
+  }
+
+  ConstantInt *getPtrAuthKey() {
+    // TODO: fix pointer authentication
+    return ConstantInt::get(Type::getInt32Ty(CurrentModule->getContext()),
+                            PtrAuthKey);
+  }
+
+  /// Returns the value of function \p FuncIdx, and signs it if required.
+  Constant *getSignedValue(const ParamInfo &PI, unsigned FuncIdx) {
+    Constant *value = PI.Values[FuncIdx];
+    if (!PI.NeedsPointerSigning)
+      return value;
+
+    auto lookupKey = std::make_pair(value, PI.Discriminator);
+    Constant *&ptrAuthGlobal = PtrAuthGlobals[lookupKey];
+    if (!ptrAuthGlobal) {
+      // TODO: fix pointer authentication
+    }
+    return ptrAuthGlobal;
+  }
+
+  /// Replace all direct calls of Old with calls of New. Will bitcast New if
+  /// necessary to make types match.
+  bool replaceDirectCallers(Function *Old, Function *New,
+                            const ParamInfos &Params, unsigned FuncIdx);
+};
+
+} // end anonymous namespace
+
+bool MergeFuncIgnoringConstImpl::doSanityCheck(
+    std::vector<WeakTrackingVH> &Worklist) {
+  if (const unsigned Max = NumFunctionsIgnoringConstForSanityCheck) {
+    unsigned TripleNumber = 0;
+    bool Valid = true;
+
+    dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
+
+    unsigned i = 0;
+    for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(),
+                                               E = Worklist.end();
+         I != E && i < Max; ++I, ++i) {
+      unsigned j = i;
+      for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max;
+           ++J, ++j) {
+        Function *F1 = cast<Function>(*I);
+        Function *F2 = cast<Function>(*J);
+        int Res1 = FunctionComparatorIgnoringConst(F1, F2, &GlobalNumbers)
+                       .compareIgnoringConsts();
+        int Res2 = FunctionComparatorIgnoringConst(F2, F1, &GlobalNumbers)
+                       .compareIgnoringConsts();
+
+        // If F1 <= F2, then F2 >= F1, otherwise report failure.
+        if (Res1 != -Res2) {
+          dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
+                 << "\n";
+          LLVM_DEBUG(F1->dump());
+          LLVM_DEBUG(F2->dump());
+          Valid = false;
+        }
+
+        if (Res1 == 0)
+          continue;
+
+        unsigned k = j;
+        for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max;
+             ++k, ++K, ++TripleNumber) {
+          if (K == J)
+            continue;
+
+          Function *F3 = cast<Function>(*K);
+          int Res3 = FunctionComparatorIgnoringConst(F1, F3, &GlobalNumbers)
+                         .compareIgnoringConsts();
+          int Res4 = FunctionComparatorIgnoringConst(F2, F3, &GlobalNumbers)
+                         .compareIgnoringConsts();
+
+          bool Transitive = true;
+
+          if (Res1 != 0 && Res1 == Res4) {
+            // F1 > F2, F2 > F3 => F1 > F3
+            Transitive = Res3 == Res1;
+          } else if (Res3 != 0 && Res3 == -Res4) {
+            // F1 > F3, F3 > F2 => F1 > F2
+            Transitive = Res3 == Res1;
+          } else if (Res4 != 0 && -Res3 == Res4) {
+            // F2 > F3, F3 > F1 => F2 > F1
+            Transitive = Res4 == -Res1;
+          }
+
+          if (!Transitive) {
+            dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: "
+                   << TripleNumber << "\n";
+            dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
+                   << Res4 << "\n";
+            LLVM_DEBUG(F1->dump());
+            LLVM_DEBUG(F2->dump());
+            LLVM_DEBUG(F3->dump());
+            Valid = false;
+          }
+        }
+      }
+    }
+
+    dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n";
+    return Valid;
+  }
+  return true;
+}
+
+/// Returns true if functions containing calls to \p F may be merged together.
+static bool mayMergeCallsToFunction(Function &F) {
+  StringRef Name = F.getName();
+
+  // Calls to dtrace probes must generate unique patchpoints.
+  if (Name.startswith("__dtrace"))
+    return false;
+
+  return true;
+}
+
+/// Returns the benefit, which is approximately the size of the function.
+/// Returns 0 if the function should not be merged.
+static unsigned getBenefit(Function *F) {
+  unsigned Benefit = 0;
+
+  // We don't want to merge very small functions, because the overhead of
+  // creating thunks and/or adding parameters to the call sites
+  // outweighs the benefit.
+  for (BasicBlock &BB : *F) {
+    for (Instruction &I : BB) {
+      if (CallBase *CB = dyn_cast<CallBase>(&I)) {
+        Function *Callee = CB->getCalledFunction();
+        if (Callee && !mayMergeCallsToFunction(*Callee))
+          return 0;
+        if (!Callee || !Callee->isIntrinsic()) {
+          Benefit += 5;
+          continue;
+        }
+      }
+      Benefit += 1;
+    }
+  }
+  return Benefit;
+}
+
+/// Returns true if function \p F is eligible for merging.
+bool isEligibleFunction(Function *F) {
+  if (F->isDeclaration())
+    return false;
+
+  if (F->hasFnAttribute(llvm::Attribute::NoMerge))
+    return false;
+
+  if (F->hasAvailableExternallyLinkage()) {
+    return false;
+  }
+
+  if (F->getFunctionType()->isVarArg()) {
+    return false;
+  }
+
+  // Check against blocklist.
+  if (!MergeBlockRegexFilters.empty()) {
+    StringRef FuncName = F->getName();
+    for (const auto &tRegex : MergeBlockRegexFilters)
+      if (Regex(tRegex).match(FuncName)) {
+        return false;
+      }
+  }
+  // Check against allowlist
+  if (!MergeAllowRegexFilters.empty()) {
+    StringRef FuncName = F->getName();
+    bool found = false;
+    for (const auto &tRegex : MergeAllowRegexFilters)
+      if (Regex(tRegex).match(FuncName)) {
+        found = true;
+        break;
+      }
+    if (!found)
+      return false;
+  }
+
+  if (F->getCallingConv() == CallingConv::SwiftTail)
+    return false;
+
+  // If a function contains musttail callsites and we merge it, the merged
+  // function will still contain the musttail callsite, but the number of
+  // parameters can change; the callsite's argument count would then
+  // mismatch the merged function's parameter count.
+  if (IgnoreMusttailFunction) {
+    for (const BasicBlock &BB : *F) {
+      for (const Instruction &I : BB) {
+        const auto *CB = dyn_cast<CallBase>(&I);
+        if (CB && CB->isMustTailCall())
+          return false;
+      }
+    }
+  }
+
+  unsigned Benefit = getBenefit(F);
+  if (Benefit < IgnoringConstMergeThreshold) {
+    return false;
+  }
+
+  return true;
+}
+
+bool MergeFuncIgnoringConstImpl::runImpl(Module &M) {
+  if (IgnoringConstMergeThreshold == 0)
+    return false;
+
+  CurrentModule = &M;
+
+  // TODO: fix pointer authentication
+
+  bool Changed = false;
+
+  // All functions in the module, ordered by hash. Functions with a unique
+  // hash value are easily eliminated.
+  std::vector<std::pair<llvm::IRHash, Function *>> HashedFuncs;
+
+  for (Function &Func : M) {
+    if (isEligibleFunction(&Func)) {
+      HashedFuncs.push_back({StructuralHash(Func), &Func});
+    }
+  }
+
+  std::stable_sort(HashedFuncs.begin(), HashedFuncs.end(),
+                   [](const std::pair<llvm::IRHash, Function *> &a,
+                      const std::pair<llvm::IRHash, Function *> &b) {
+                     return a.first < b.first;
+                   });
+
+  std::vector<FunctionEntry> FuncEntryStorage;
+  FuncEntryStorage.reserve(HashedFuncs.size());
+
+  auto S = HashedFuncs.begin();
+  for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) {
+
+    Function *F = I->second;
+    FuncEntryStorage.push_back(FunctionEntry(F, FnTree.end()));
+    FunctionEntry &FE = FuncEntryStorage.back();
+    FuncEntries[F] = &FE;
+
+    // If the hash value matches the previous value or the next one, we must
+    // consider merging it. Otherwise it is dropped and never considered again.
+    if ((I != S && std::prev(I)->first == I->first) ||
+        (std::next(I) != IE && std::next(I)->first == I->first)) {
+      Deferred.push_back(WeakTrackingVH(F));
+    }
+  }
+
+  do {
+    std::vector<WeakTrackingVH> Worklist;
+    Deferred.swap(Worklist);
+
+    LLVM_DEBUG(dbgs() << "======\nbuild tree: worklist-size=" << Worklist.size()
+                      << '\n');
+    LLVM_DEBUG(doSanityCheck(Worklist));
+
+    SmallVector<FunctionEntry *, 8> FuncsToMerge;
+
+    // Insert all candidates into the Worklist.
+    for (WeakTrackingVH &I : Worklist) {
+      if (!I)
+        continue;
+      Function *F = cast<Function>(I);
+      FunctionEntry *FE = getEntry(F);
+      assert(!isInEquivalenceClass(FE));
+
+      std::pair<FnTreeType::iterator, bool> Result = FnTree.insert(FE);
+
+      FE->TreeIter = Result.first;
+      const EquivalenceClass &Eq = *Result.first;
+
+      if (Result.second) {
+        assert(Eq.First == FE);
+        LLVM_DEBUG(dbgs() << "  new in tree: " << F->getName() << '\n');
+      } else {
+        assert(Eq.First != FE);
+        LLVM_DEBUG(dbgs() << "  add to existing: " << F->getName() << '\n');
+        // Add the function to the existing equivalence class.
+        FE->Next = Eq.First->Next;
+        Eq.First->Next = FE;
+        // Schedule for merging if the function's equivalence class reaches a
+        // size of 2.
+        if (!FE->Next)
+          FuncsToMerge.push_back(Eq.First);
+      }
+    }
+    LLVM_DEBUG(dbgs() << "merge functions: tree-size=" << FnTree.size()
+                      << '\n');
+
+    // Figure out the leaf functions. We want to do the merging in bottom-up
+    // call order. This ensures that we don't parameterize on callee function
+    // names if we don't have to (because the callee may be merged).
+    // Note that "leaf functions" refer to the sub-call-graph of functions which
+    // are in the FnTree.
+    for (FunctionEntry *ToMerge : FuncsToMerge) {
+      assert(isInEquivalenceClass(ToMerge));
+      updateUnhandledCalleeCount(ToMerge, 1);
+    }
+
+    // Check if there are any leaf functions at all.
+    bool LeafFound = false;
+    for (FunctionEntry *ToMerge : FuncsToMerge) {
+      if (ToMerge->NumUnhandledCallees == 0)
+        LeafFound = true;
+    }
+    for (FunctionEntry *ToMerge : FuncsToMerge) {
+      if (isInEquivalenceClass(ToMerge)) {
+        // Only merge leaf functions (or all functions if all functions are in
+        // a call cycle).
+        if (ToMerge->NumUnhandledCallees == 0 || !LeafFound) {
+          updateUnhandledCalleeCount(ToMerge, -1);
+          Changed |= tryMergeEquivalenceClass(ToMerge);
+        } else {
+          // Non-leaf functions (i.e. functions in a call cycle) may become
+          // leaf functions in the next iteration.
+          removeEquivalenceClassFromTree(ToMerge);
+        }
+      }
+    }
+  } while (!Deferred.empty());
+
+  FnTree.clear();
+  GlobalNumbers.clear();
+  FuncEntries.clear();
+  PtrAuthGlobals.clear();
+
+  return Changed;
+}
+
+void MergeFuncIgnoringConstImpl::updateUnhandledCalleeCount(FunctionEntry *FE,
+                                                            int Delta) {
+  // Iterate over all functions of FE's equivalence class.
+  do {
+    for (Use &U : FE->F->uses()) {
+      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+        FunctionEntry *CallerFE = getEntry(I->getFunction());
+        if (CallerFE && CallerFE->TreeIter != FnTree.end()) {
+          // Accumulate the count in the first entry of the equivalence class.
+          FunctionEntry *Head = CallerFE->TreeIter->First;
+          Head->NumUnhandledCallees += Delta;
+        }
+      }
+    }
+    FE = FE->Next;
+  } while (FE);
+}
+
+bool MergeFuncIgnoringConstImpl::tryMergeEquivalenceClass(
+    FunctionEntry *FirstInClass) {
+  // Build the FInfos vector from all functions in the equivalence class.
+  FunctionInfos FInfos;
+  FunctionEntry *FE = FirstInClass;
+  do {
+    FInfos.push_back(FunctionInfo(FE->F));
+    FE->IsMerged = true;
+    FE = FE->Next;
+  } while (FE);
+  assert(FInfos.size() >= 2);
+
+  // Merged or not: in any case we remove the equivalence class from the FnTree.
+  removeEquivalenceClassFromTree(FirstInClass);
+
+  // Contains functions which differ too much from the first function (i.e.
+  // would need too many parameters).
+  FunctionInfos Removed;
+
+  bool Changed = false;
+  int Try = 0;
+
+  unsigned Benefit = getBenefit(FirstInClass->F);
+
+  // The bigger the function, the more parameters are allowed.
+  unsigned maxParams = std::max(4u, Benefit / 100);
+
+  // We need multiple tries if there are some functions in FInfos which differ
+  // too much from the first function in FInfos. But we limit the number of
+  // tries to a small number, because this is quadratic.
+  while (FInfos.size() >= 2 && Try++ < 4) {
+    ParamInfos Params;
+    bool Merged = deriveParams(Params, FInfos, maxParams);
+    if (Merged) {
+      mergeWithParams(FInfos, Params);
+      Changed = true;
+    } else {
+      // We ran out of parameters. Remove the function from the set which
+      // differs most from the first function.
+      Removed.push_back(removeFuncWithMostParams(FInfos));
+    }
+    if (Merged || FInfos.size() < 2) {
+      // Try again with the functions which were removed from the original set.
+      FInfos.swap(Removed);
+      Removed.clear();
+    }
+  }
+  return Changed;
+}
+
+/// Remove the function from \p FInfos which needs the most parameters and
+/// return the removed function.
+MergeFuncIgnoringConstImpl::FunctionInfo
+MergeFuncIgnoringConstImpl::removeFuncWithMostParams(FunctionInfos &FInfos) {
+  FunctionInfos::iterator MaxIter = FInfos.end();
+  for (auto Iter = FInfos.begin(), End = FInfos.end(); Iter != End; ++Iter) {
+    if (MaxIter == FInfos.end() ||
+        Iter->NumParamsNeeded > MaxIter->NumParamsNeeded) {
+      MaxIter = Iter;
+    }
+  }
+  FunctionInfo Removed = *MaxIter;
+  FInfos.erase(MaxIter);
+  return Removed;
+}
+
+/// Finds the set of parameters which are required to merge the functions in
+/// \p FInfos.
+/// Returns true on success, i.e. the functions in \p FInfos can be merged with
+/// the parameters returned in \p Params.
+bool MergeFuncIgnoringConstImpl::deriveParams(ParamInfos &Params,
+                                              FunctionInfos &FInfos,
+                                              unsigned maxParams) {
+  for (FunctionInfo &FI : FInfos)
+    FI.init();
+
+  FunctionInfo &FirstFI = FInfos.front();
+
+  // Iterate over all instructions synchronously in all functions.
+  do {
+    if (isEligibleInstrunctionForConstantSharing(FirstFI.CurrentInst)) {
+
+      // Here we handle a rare corner case which needs to be explained:
+      // usually the number of operands matches, because otherwise the
+      // functions in FInfos would not be in the same equivalence class. The
+      // only exception: if the current instruction is a call to a function
+      // that was merged in the previous iteration (in
+      // tryMergeEquivalenceClass), the call may have been replaced and now
+      // have more arguments than the original call.
+      if (numOperandsDiffer(FInfos)) {
+        assert(isa<CallInst>(FirstFI.CurrentInst) &&
+               "only calls are expected to differ in number of operands");
+        return false;
+      }
+
+      for (unsigned OpIdx = 0, NumOps = FirstFI.CurrentInst->getNumOperands();
+           OpIdx != NumOps; ++OpIdx) {
+
+        if (constsDiffer(FInfos, OpIdx)) {
+          // This instruction has operands which differ in at least some
+          // functions. So we need to parameterize it.
+          if (!tryMapToParameter(FInfos, OpIdx, Params, maxParams)) {
+            // We ran out of parameters.
+            return false;
+          }
+        }
+      }
+    }
+    // Go to the next instruction in all functions.
+    for (FunctionInfo &FI : FInfos)
+      FI.nextInst();
+  } while (FirstFI.CurrentInst);
+
+  return true;
+}
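+
+// A sketch of the operand-count corner case handled above (hypothetical IR):
+// after @callee was merged in a previous iteration, a call in one function
+// of the class has already been rewritten to the parameterized body, while
+// the corresponding call in another function still has the original
+// signature, so the operand counts no longer match:
+//
+//   %c = call i32 @callee.Tm(i32 %sum, ptr @g1)   ; already replaced
+//   %c = call i32 @callee(i32 %sum)               ; original call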
+
+/// Returns true if the number of operands of the current instruction differs
+/// between the functions in \p FInfos.
+bool MergeFuncIgnoringConstImpl::numOperandsDiffer(FunctionInfos &FInfos) {
+  unsigned numOps = FInfos[0].CurrentInst->getNumOperands();
+  for (const FunctionInfo &FI : ArrayRef<FunctionInfo>(FInfos).drop_front(1)) {
+    if (FI.CurrentInst->getNumOperands() != numOps)
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if the constant operand at index \p OpIdx of the current
+/// instruction differs in any of the functions in \p FInfos.
+bool MergeFuncIgnoringConstImpl::constsDiffer(const FunctionInfos &FInfos,
+                                              unsigned OpIdx) {
+  Constant *CommonConst = nullptr;
+
+  for (const FunctionInfo &FI : FInfos) {
+    Value *Op = FI.CurrentInst->getOperand(OpIdx);
+    if (auto *C = dyn_cast<Constant>(Op)) {
+      if (!CommonConst) {
+        CommonConst = C;
+      } else if (EnableAggressiveMergeFunc &&
+                 isa<ConstantPointerNull>(CommonConst) &&
+                 isa<ConstantPointerNull>(C)) {
+        // If both operands are null pointers which are distinct constants
+        // only because of their type, still treat them as the same.
+      } else if (C != CommonConst) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
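+
+// Note on the aggressive case above: with typed pointers, "i8* null" and
+// "i32* null" are two distinct uniqued Constant objects, so the plain
+// pointer comparison (C != CommonConst) would report a difference;
+// EnableAggressiveMergeFunc deliberately treats such nulls as equal.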
+
+/// Create a new parameter for differing operands or try to reuse an existing
+/// parameter.
+/// Returns true if a parameter could be created or found without exceeding the
+/// maximum number of parameters.
+bool MergeFuncIgnoringConstImpl::tryMapToParameter(FunctionInfos &FInfos,
+                                                   unsigned OpIdx,
+                                                   ParamInfos &Params,
+                                                   unsigned maxParams) {
+  ParamInfo *Matching = nullptr;
+  // Try to find an existing parameter which exactly matches the differing
+  // operands of the current instruction.
+  for (ParamInfo &PI : Params) {
+    if (PI.matches(FInfos, OpIdx, isPtrAuthEnabled())) {
+      Matching = &PI;
+      break;
+    }
+  }
+  if (!Matching) {
+    // We need a new parameter.
+    // Check if we are within the limit.
+    if (Params.size() >= maxParams)
+      return false;
+
+    Params.resize(Params.size() + 1);
+    Matching = &Params.back();
+    // Store the constant values into the new parameter.
+    Constant *FirstC = cast<Constant>(FInfos[0].CurrentInst->getOperand(OpIdx));
+    for (FunctionInfo &FI : FInfos) {
+      Constant *C = cast<Constant>(FI.CurrentInst->getOperand(OpIdx));
+      Matching->Values.push_back(C);
+      if (C != FirstC)
+        FI.NumParamsNeeded += 1;
+    }
+    if (isPtrAuthEnabled())
+      Matching->NeedsPointerSigning = FInfos[0].needsPointerSigning(OpIdx);
+  }
+  // Remember where the parameter is needed when we build our merged function.
+  Matching->Uses.push_back({FInfos[0].CurrentInst, OpIdx});
+  return true;
+}
+
+/// Replace \p origCall with a copy that calls \p newCallee and carries a
+/// ptrauth operand bundle built from \p Discriminator.
+void MergeFuncIgnoringConstImpl::replaceCallWithAddedPtrAuth(
+    CallInst *origCall, Value *newCallee, ConstantInt *Discriminator) {
+  SmallVector<llvm::OperandBundleDef, 4> bundles;
+  origCall->getOperandBundlesAsDefs(bundles);
+  ConstantInt *key = getPtrAuthKey();
+  llvm::Value *bundleArgs[] = {key, Discriminator};
+  bundles.emplace_back("ptrauth", bundleArgs);
+
+  SmallVector<llvm::Value *, 4> copiedArgs;
+  for (Value *op : origCall->args()) {
+    copiedArgs.push_back(op);
+  }
+
+  auto *newCall =
+      CallInst::Create(origCall->getFunctionType(), newCallee, copiedArgs,
+                       bundles, origCall->getName(), origCall);
+  newCall->setAttributes(origCall->getAttributes());
+  newCall->setTailCallKind(origCall->getTailCallKind());
+  newCall->setCallingConv(origCall->getCallingConv());
+  origCall->replaceAllUsesWith(newCall);
+  origCall->eraseFromParent();
+}
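+
+// The rewritten call then carries a "ptrauth" operand bundle, e.g. (key and
+// discriminator values are hypothetical):
+//
+//   %r = call i32 %signed_callee(i32 %x) [ "ptrauth"(i32 0, i64 5678) ]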
+
+void MergeFuncIgnoringConstImpl::dumpMergeInfo(const FunctionInfos &FInfos,
+                                               unsigned paramSize) {
+  std::set<llvm::IRHash> oHashes;
+  std::vector<std::string> funcLocs;
+  Function *OrigFunc = nullptr;
+  for (const auto &FInfo : FInfos) {
+    OrigFunc = FInfo.F;
+
+    llvm::IRHash origHash = StructuralHash(*OrigFunc);
+    oHashes.insert(origHash);
+
+    // Print debug location.
+    std::string Result;
+    raw_string_ostream DbgLocOS(Result);
+    if (DISubprogram *DIS = OrigFunc->getSubprogram()) {
+      DebugLoc FuncDbgLoc =
+          DILocation::get(DIS->getContext(), DIS->getScopeLine(), 0, DIS);
+      FuncDbgLoc.print(DbgLocOS);
+      DbgLocOS.flush();
+    }
+    std::string singleLine =
+        "# functionLoc " +
+        std::to_string(GlobalValue::getGUID(OrigFunc->getName())) + " " +
+        Result + " " + std::string(OrigFunc->getName()) + "\n";
+    funcLocs.push_back(singleLine);
+  }
+}
+
+/// Merge all functions in \p FInfos by creating thunks which call the single
+/// merged function with additional parameters.
+void MergeFuncIgnoringConstImpl::mergeWithParams(const FunctionInfos &FInfos,
+                                                 ParamInfos &Params) {
+  // We reuse the body of the first function for the new merged function.
+  Function *FirstF = FInfos.front().F;
+
+  // Build the type for the merged function. This will be the type of the
+  // original function (FirstF) but with the additional parameters which are
+  // needed to parameterize the merged function.
+  FunctionType *OrigTy = FirstF->getFunctionType();
+  SmallVector<Type *, 8> ParamTypes(OrigTy->param_begin(), OrigTy->param_end());
+
+  for (const ParamInfo &PI : Params) {
+    ParamTypes.push_back(PI.Values[0]->getType());
+  }
+
+  FunctionType *funcType =
+      FunctionType::get(OrigTy->getReturnType(), ParamTypes, false);
+
+  // Create the new function.
+  Function *NewFunction = Function::Create(funcType, FirstF->getLinkage(),
+                                           FirstF->getName() + MergeFuncSuffix);
+  if (auto *SP = FirstF->getSubprogram())
+    NewFunction->setSubprogram(SP);
+  NewFunction->copyAttributesFrom(FirstF);
+  // NOTE: this function is not externally available, so ensure that we reset
+  // the DLL storage class.
+  NewFunction->setDLLStorageClass(GlobalValue::DefaultStorageClass);
+  if (UseLinkOnceODRLinkageMerging)
+    NewFunction->setLinkage(GlobalValue::LinkOnceODRLinkage);
+  else
+    NewFunction->setLinkage(GlobalValue::InternalLinkage);
+  if (NoInlineForMergedFunction)
+    NewFunction->addFnAttr(Attribute::NoInline);
+
+  // Insert the new function after the last function in the equivalence class.
+  FirstF->getParent()->getFunctionList().insert(
+      std::next(FInfos[1].F->getIterator()), NewFunction);
+
+  LLVM_DEBUG(dbgs() << "  Merge into " << NewFunction->getName() << '\n');
+
+  // Move the body of FirstF into the NewFunction.
+  NewFunction->splice(NewFunction->begin(), FirstF);
+
+  auto NewArgIter = NewFunction->arg_begin();
+  for (Argument &OrigArg : FirstF->args()) {
+    Argument &NewArg = *NewArgIter++;
+    OrigArg.replaceAllUsesWith(&NewArg);
+  }
+  unsigned numOrigArgs = FirstF->arg_size();
+
+  SmallPtrSet<Function *, 8> SelfReferencingFunctions;
+
+  // Replace all differing operands with a parameter.
+  for (unsigned paramIdx = 0; paramIdx < Params.size(); ++paramIdx) {
+    const ParamInfo &PI = Params[paramIdx];
+    Argument *NewArg = NewFunction->getArg(numOrigArgs + paramIdx);
+
+    if (!PI.NeedsPointerSigning) {
+      for (const OpLocation &OL : PI.Uses) {
+        OL.I->setOperand(OL.OpIndex, NewArg);
+      }
+    }
+    // Collect all functions which are referenced by any parameter.
+    for (Value *V : PI.Values) {
+      if (auto *F = dyn_cast<Function>(V))
+        SelfReferencingFunctions.insert(F);
+    }
+  }
+
+  // Replace all differing operands which need pointer signing with a
+  // parameter. We must do this after all other parameters, because here we
+  // replace call instructions, which must still be live in case they carry
+  // another constant to be replaced.
+  for (unsigned paramIdx = 0; paramIdx < Params.size(); ++paramIdx) {
+    ParamInfo &PI = Params[paramIdx];
+    if (PI.NeedsPointerSigning) {
+      PI.computeDiscriminator(NewFunction->getContext());
+      for (const OpLocation &OL : PI.Uses) {
+        auto *origCall = cast<CallInst>(OL.I);
+        Argument *newCallee = NewFunction->getArg(numOrigArgs + paramIdx);
+        replaceCallWithAddedPtrAuth(origCall, newCallee, PI.Discriminator);
+      }
+    }
+  }
+
+  for (unsigned FIdx = 0, NumFuncs = FInfos.size(); FIdx < NumFuncs; ++FIdx) {
+    Function *OrigFunc = FInfos[FIdx].F;
+    // Don't try to replace all callers of functions which are used as
+    // parameters because we must not delete such functions.
+    if (SelfReferencingFunctions.count(OrigFunc) == 0 &&
+        replaceDirectCallers(OrigFunc, NewFunction, Params, FIdx)) {
+      // We could replace all uses (and the function is not externally visible),
+      // so we can delete the original function.
+      auto Iter = FuncEntries.find(OrigFunc);
+      assert(Iter != FuncEntries.end());
+      assert(!isInEquivalenceClass(&*Iter->second));
+      Iter->second->F = nullptr;
+      FuncEntries.erase(Iter);
+      LLVM_DEBUG(dbgs() << "    Erase " << OrigFunc->getName() << '\n');
+      OrigFunc->eraseFromParent();
+    } else {
+      // Otherwise we need a thunk which calls the merged function.
+      writeThunk(NewFunction, OrigFunc, Params, FIdx);
+    }
+    ++NumFunctionsMergedIgnoringConst;
+  }
+}
+
+/// Remove all functions of \p FE's equivalence class from FnTree. Add them to
+/// Deferred so that we'll look at them in the next round.
+void MergeFuncIgnoringConstImpl::removeEquivalenceClassFromTree(
+    FunctionEntry *FE) {
+  if (!isInEquivalenceClass(FE))
+    return;
+
+  FnTreeType::iterator Iter = FE->TreeIter;
+  FunctionEntry *Unlink = Iter->First;
+  Unlink->NumUnhandledCallees = 0;
+  while (Unlink) {
+    LLVM_DEBUG(dbgs() << "    remove from tree: " << Unlink->F->getName()
+                      << '\n');
+    if (!Unlink->IsMerged)
+      Deferred.emplace_back(Unlink->F);
+    Unlink->TreeIter = FnTree.end();
+    assert(Unlink->NumUnhandledCallees == 0);
+    FunctionEntry *NextEntry = Unlink->Next;
+    Unlink->Next = nullptr;
+    Unlink = NextEntry;
+  }
+  FnTree.erase(Iter);
+}
+
+// Helper for writeThunk: selects the proper cast operation, but is a bit
+// simpler than CastInst::getCastOpcode.
+Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
+  Type *SrcTy = V->getType();
+  if (SrcTy->isStructTy()) {
+    assert(DestTy->isStructTy());
+    assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
+    Value *Result = UndefValue::get(DestTy);
+    for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
+      Value *Element =
+          createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)),
+                     DestTy->getStructElementType(I));
+
+      Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I));
+    }
+    return Result;
+  }
+  assert(!DestTy->isStructTy());
+  if (CastArrayType) {
+    if (auto *SrcAT = dyn_cast<ArrayType>(SrcTy)) {
+      auto *DestAT = dyn_cast<ArrayType>(DestTy);
+      assert(DestAT);
+      assert(SrcAT->getNumElements() == DestAT->getNumElements());
+      Value *Result = UndefValue::get(DestTy);
+      for (unsigned int I = 0, E = SrcAT->getNumElements(); I < E; ++I) {
+        Value *Element =
+            createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)),
+                       DestAT->getElementType());
+
+        Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I));
+      }
+      return Result;
+    }
+    assert(!DestTy->isArrayTy());
+  }
+  if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+    return Builder.CreateIntToPtr(V, DestTy);
+  else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+    return Builder.CreatePtrToInt(V, DestTy);
+  else
+    return Builder.CreateBitCast(V, DestTy);
+}
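+
+// A sketch of what createCast produces for an aggregate (hypothetical IR):
+// casting { i64, i64 } %v to { ptr, i64 } expands element-wise into
+//
+//   %e0 = extractvalue { i64, i64 } %v, 0
+//   %c0 = inttoptr i64 %e0 to ptr
+//   %r0 = insertvalue { ptr, i64 } undef, ptr %c0, 0
+//   %e1 = extractvalue { i64, i64 } %v, 1
+//   %r1 = insertvalue { ptr, i64 } %r0, i64 %e1, 1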
+
+/// Replace \p Thunk with a simple tail call to \p ToFunc. Also add parameters
+/// to the call to \p ToFunc, which are defined by the values for function
+/// \p FuncIdx in \p Params.
+void MergeFuncIgnoringConstImpl::writeThunk(Function *ToFunc, Function *Thunk,
+                                            const ParamInfos &Params,
+                                            unsigned FuncIdx) {
+  // Delete the existing content of Thunk.
+  Thunk->dropAllReferences();
+
+  BasicBlock *BB = BasicBlock::Create(Thunk->getContext(), "", Thunk);
+  IRBuilder<> Builder(BB);
+
+  SmallVector<Value *, 16> Args;
+  unsigned ParamIdx = 0;
+  FunctionType *ToFuncTy = ToFunc->getFunctionType();
+
+  // Add arguments which are passed through Thunk.
+  for (Argument &AI : Thunk->args()) {
+    Args.push_back(createCast(Builder, &AI, ToFuncTy->getParamType(ParamIdx)));
+    ++ParamIdx;
+  }
+  // Add new arguments defined by Params.
+  for (const ParamInfo &PI : Params) {
+    assert(ParamIdx < ToFuncTy->getNumParams());
+    Constant *param = getSignedValue(PI, FuncIdx);
+    Args.push_back(
+        createCast(Builder, param, ToFuncTy->getParamType(ParamIdx)));
+    ++ParamIdx;
+  }
+
+  CallInst *CI = Builder.CreateCall(ToFunc, Args);
+  bool isSwiftTailCall = ToFunc->getCallingConv() == CallingConv::SwiftTail &&
+                         Thunk->getCallingConv() == CallingConv::SwiftTail;
+  CI->setTailCallKind(isSwiftTailCall ? llvm::CallInst::TCK_MustTail
+                                      : llvm::CallInst::TCK_Tail);
+  CI->setCallingConv(ToFunc->getCallingConv());
+  CI->setAttributes(ToFunc->getAttributes());
+  if (Thunk->getReturnType()->isVoidTy()) {
+    Builder.CreateRetVoid();
+  } else {
+    Builder.CreateRet(createCast(Builder, CI, Thunk->getReturnType()));
+  }
+
+  LLVM_DEBUG(dbgs() << "    writeThunk: " << Thunk->getName() << '\n');
+  ++NumThunksWrittenIgnoringConst;
+}
+
+static llvm::AttributeList
+fixUpTypesInByValAndStructRetAttributes(llvm::FunctionType *fnType,
+                                        llvm::AttributeList attrList) {
+  auto &context = fnType->getContext();
+  if (!context.supportsTypedPointers())
+    return attrList;
+
+  for (unsigned i = 0; i < fnType->getNumParams(); ++i) {
+    auto paramTy = fnType->getParamType(i);
+    auto attrListIndex = llvm::AttributeList::FirstArgIndex + i;
+    if (attrList.hasParamAttr(i, llvm::Attribute::StructRet) &&
+        paramTy->getNonOpaquePointerElementType() !=
+            attrList.getParamStructRetType(i))
+      attrList = attrList.replaceAttributeTypeAtIndex(
+          context, attrListIndex, llvm::Attribute::StructRet,
+          paramTy->getNonOpaquePointerElementType());
+    if (attrList.hasParamAttr(i, llvm::Attribute::ByVal) &&
+        paramTy->getNonOpaquePointerElementType() !=
+            attrList.getParamByValType(i))
+      attrList = attrList.replaceAttributeTypeAtIndex(
+          context, attrListIndex, llvm::Attribute::ByVal,
+          paramTy->getNonOpaquePointerElementType());
+  }
+  return attrList;
+}
+
+/// Replace direct callers of \p Old with \p New. Also add parameters to the
+/// call to \p New, which are defined by the values for function \p FuncIdx
+/// in \p Params.
+bool MergeFuncIgnoringConstImpl::replaceDirectCallers(Function *Old,
+                                                      Function *New,
+                                                      const ParamInfos &Params,
+                                                      unsigned FuncIdx) {
+  bool AllReplaced = true;
+
+  SmallVector<CallInst *, 8> Callers;
+
+  for (Use &U : Old->uses()) {
+    auto *I = dyn_cast<Instruction>(U.getUser());
+    if (!I) {
+      AllReplaced = false;
+      continue;
+    }
+    FunctionEntry *FE = getEntry(I->getFunction());
+    if (FE)
+      removeEquivalenceClassFromTree(FE);
+
+    auto *CI = dyn_cast<CallInst>(I);
+    if (!CI || CI->getCalledOperand() != Old) {
+      AllReplaced = false;
+      continue;
+    }
+    Callers.push_back(CI);
+  }
+  if (!AllReplaced)
+    return false;
+
+  // When AlwaysCallThunk is true, return false so that a thunk is emitted,
+  // and do not replace the call sites.
+  if (AlwaysCallThunk)
+    return false;
+
+  for (CallInst *CI : Callers) {
+    auto &Context = New->getContext();
+    auto NewPAL = New->getAttributes();
+
+    SmallVector<Type *, 8> OldParamTypes;
+    SmallVector<Value *, 16> NewArgs;
+    SmallVector<AttributeSet, 8> NewArgAttrs;
+    IRBuilder<> Builder(CI);
+
+    FunctionType *NewFuncTy = New->getFunctionType();
+    (void)NewFuncTy;
+    unsigned ParamIdx = 0;
+
+    // Add the existing parameters.
+    for (Value *OldArg : CI->args()) {
+      NewArgAttrs.push_back(NewPAL.getParamAttrs(ParamIdx));
+      NewArgs.push_back(OldArg);
+      OldParamTypes.push_back(OldArg->getType());
+      ++ParamIdx;
+    }
+    // Add the new parameters.
+    for (const ParamInfo &PI : Params) {
+      assert(ParamIdx < NewFuncTy->getNumParams());
+      Constant *ArgValue = getSignedValue(PI, FuncIdx);
+      assert(ArgValue != Old && "should not try to replace all callers of self "
+                                "referencing functions");
+      NewArgs.push_back(ArgValue);
+      OldParamTypes.push_back(ArgValue->getType());
+      ++ParamIdx;
+    }
+
+    auto *FType = FunctionType::get(Old->getFunctionType()->getReturnType(),
+                                    OldParamTypes, false);
+    auto *FPtrType = PointerType::get(
+        FType, cast<PointerType>(New->getType())->getAddressSpace());
+
+    Value *Callee = ConstantExpr::getBitCast(New, FPtrType);
+    CallInst *NewCI;
+    if (objcarc::hasAttachedCallOpBundle(CI)) {
+      Value *BundleArgs[] = {*objcarc::getAttachedARCFunction(CI)};
+      OperandBundleDef OB("clang.arc.attachedcall", BundleArgs);
+      NewCI = Builder.CreateCall(FType, Callee, NewArgs, {OB});
+    } else {
+      NewCI = Builder.CreateCall(FType, Callee, NewArgs);
+    }
+    NewCI->setCallingConv(CI->getCallingConv());
+    // Don't transfer function attributes from the function to the call site.
+    // They typically aren't relevant to the calling convention or ABI.
+    auto newAttrList = AttributeList::get(Context, /*FnAttrs=*/AttributeSet(),
+                                          NewPAL.getRetAttrs(), NewArgAttrs);
+    newAttrList = fixUpTypesInByValAndStructRetAttributes(FType, newAttrList);
+    NewCI->setAttributes(newAttrList);
+    if (IgnoreMusttailFunction && CI->isMustTailCall()) {
+      // Report that we are replacing a call site which has musttail.
+      llvm::errs() << "callsite has musttail in newF " << New->getName()
+                   << "\n";
+    }
+    NewCI->copyMetadata(*CI);
+    CI->replaceAllUsesWith(NewCI);
+    CI->eraseFromParent();
+  }
+  assert(Old->use_empty() && "should have replaced all uses of old function");
+  return Old->hasLocalLinkage();
+}
+
+PreservedAnalyses MergeFuncIgnoringConstPass::run(Module &M,
+                                                  ModuleAnalysisManager &MAM) {
+  if (MergeFuncIgnoringConstImpl(PtrAuthEnabled, PtrAuthKey, MergeFuncSuffix)
+          .runImpl(M))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index 51e8821773c3af3..9c320beb09711af 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -27,6 +27,7 @@ add_llvm_component_library(LLVMTransformUtils
   FixIrreducible.cpp
   FlattenCFG.cpp
   FunctionComparator.cpp
+  FunctionComparatorIgnoringConst.cpp
   FunctionImportUtils.cpp
   GlobalStatus.cpp
   GuardUtils.cpp
diff --git a/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp b/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp
new file mode 100644
index 000000000000000..9cfd95345598083
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp
@@ -0,0 +1,107 @@
+//===--- FunctionComparatorIgnoringConst.cpp - Function Comparator --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h"
+
+using namespace llvm;
+
+int FunctionComparatorIgnoringConst::cmpOperandsIgnoringConsts(
+    const Instruction *L, const Instruction *R, unsigned opIdx) {
+  Value *OpL = L->getOperand(opIdx);
+  Value *OpR = R->getOperand(opIdx);
+
+  int Res = cmpValues(OpL, OpR);
+  if (Res == 0)
+    return Res;
+
+  if (!isa<Constant>(OpL) || !isa<Constant>(OpR))
+    return Res;
+
+  if (!isEligibleOperandForConstantSharing(L, opIdx) ||
+      !isEligibleOperandForConstantSharing(R, opIdx))
+    return Res;
+
+  if (cmpTypes(OpL->getType(), OpR->getType()))
+    return Res;
+
+  return 0;
+}
+
+// Test whether two basic blocks have equivalent behavior.
+int FunctionComparatorIgnoringConst::cmpBasicBlocksIgnoringConsts(
+    const BasicBlock *BBL, const BasicBlock *BBR,
+    const std::set<std::pair<int, int>> *InstOpndIndex) {
+  BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
+  BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
+
+  do {
+    bool needToCmpOperands = true;
+    if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands))
+      return Res;
+    if (needToCmpOperands) {
+      assert(InstL->getNumOperands() == InstR->getNumOperands());
+
+      for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
+        // When a set of (instruction, operand) index pairs is given, we only
+        // ignore constants located at such indices. Otherwise, we compare the
+        // operands precisely.
+        if (InstOpndIndex && !InstOpndIndex->count(std::make_pair(Index, i))) {
+          Value *OpL = InstL->getOperand(i);
+          Value *OpR = InstR->getOperand(i);
+          if (int Res = cmpValues(OpL, OpR))
+            return Res;
+        }
+        if (int Res = cmpOperandsIgnoringConsts(&*InstL, &*InstR, i))
+          return Res;
+        // cmpValues should ensure this is true.
+        assert(cmpTypes(InstL->getOperand(i)->getType(),
+                        InstR->getOperand(i)->getType()) == 0);
+      }
+    }
+    ++Index;
+    ++InstL, ++InstR;
+  } while (InstL != InstLE && InstR != InstRE);
+
+  if (InstL != InstLE && InstR == InstRE)
+    return 1;
+  if (InstL == InstLE && InstR != InstRE)
+    return -1;
+  return 0;
+}
+
+// Test whether the two functions have equivalent behavior.
+int FunctionComparatorIgnoringConst::compareIgnoringConsts(
+    const std::set<std::pair<int, int>> *InstOpndIndex) {
+  beginCompare();
+  Index = 0;
+
+  if (int Res = compareSignature())
+    return Res;
+
+  Function::const_iterator LIter = FnL->begin(), LEnd = FnL->end();
+  Function::const_iterator RIter = FnR->begin(), REnd = FnR->end();
+
+  do {
+    const BasicBlock *BBL = &*LIter;
+    const BasicBlock *BBR = &*RIter;
+
+    if (int Res = cmpValues(BBL, BBR))
+      return Res;
+
+    if (int Res = cmpBasicBlocksIgnoringConsts(BBL, BBR, InstOpndIndex))
+      return Res;
+
+    ++LIter, ++RIter;
+  } while (LIter != LEnd && RIter != REnd);
+
+  return 0;
+}
diff --git a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll
new file mode 100644
index 000000000000000..1d84340da417235
--- /dev/null
+++ b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll
@@ -0,0 +1,532 @@
+; RUN: opt -S -mergefunc-ignoringconst-threshold=4 -passes=mergefunc-ignoring-const %s | FileCheck %s
+
+@g1 = external global i32
+@g2 = external global i32
+@g3 = external global i32
+@g4 = external global i32
+@g5 = external global i32
+
+; Test the most trivial example.
+
+; CHECK-LABEL: define i32 @simple_func1(i32 %x, i32 %y)
+; CHECK: %1 = tail call i32 @simple_func1.Tm(i32 %x, i32 %y, ptr @g1)
+; CHECK: ret i32 %1
+define i32 @simple_func1(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %y
+  %l = load i32, i32* @g1, align 4
+  %sum3 = add i32 %sum2, %y
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define i32 @simple_func2(i32 %x, i32 %y)
+; CHECK: %1 = tail call i32 @simple_func1.Tm(i32 %x, i32 %y, ptr @g2)
+; CHECK: ret i32 %1
+define i32 @simple_func2(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %y
+  %l = load i32, i32* @g2, align 4
+  %sum3 = add i32 %sum2, %y
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define internal i32 @simple_func1.Tm(i32 %0, i32 %1, ptr %2)
+; CHECK: %l = load i32, ptr %2
+; CHECK: ret
+
+
+; Merge 3 functions with 3 types of differing instructions: load, store and call.
+
+; CHECK-LABEL: define i32 @func1_of_3(i32 %x)
+; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g1, ptr @g1, ptr @callee1)
+; CHECK: ret i32 %1
+define i32 @func1_of_3(i32 %x) {
+  %l1 = load i32, i32* @g1, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g1, align 4
+  %sum2 = add i32 %sum, %l2
+  store i32 %sum2, i32 *@g1, align 4
+  call void @callee1(i32 %sum2)
+  %sum3 = add i32 %sum2, %l2
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define i32 @func2_of_3(i32 %x)
+; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g2, ptr @g2, ptr @callee2)
+; CHECK: ret i32 %1
+define i32 @func2_of_3(i32 %x) {
+  %l1 = load i32, i32* @g2, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g2, align 4
+  %sum2 = add i32 %sum, %l2
+  store i32 %sum2, i32 *@g2, align 4
+  call void @callee2(i32 %sum2)
+  %sum3 = add i32 %sum2, %l2
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define i32 @func3_of_3(i32 %x)
+; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g3, ptr @g1, ptr @callee3)
+; CHECK: ret i32 %1
+define i32 @func3_of_3(i32 %x) {
+  %l1 = load i32, i32* @g3, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g1, align 4
+  %sum2 = add i32 %sum, %l2
+  store i32 %sum2, i32 *@g3, align 4
+  call void @callee3(i32 %sum2)
+  %sum3 = add i32 %sum2, %l2
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define internal i32 @func1_of_3.Tm(i32 %0, ptr %1, ptr %2, ptr %3)
+; CHECK: %l1 = load i32, ptr %1
+; CHECK: %l2 = load i32, ptr %2
+; CHECK: store i32 %sum2, ptr %1
+; CHECK: call void %3(i32 %sum2)
+; CHECK: ret
+
+declare void @callee1(i32 %x)
+declare void @callee2(i32 %x)
+declare void @callee3(i32 %x)
+
+; Preserve attributes
+
+; CHECK-LABEL: define void @sret_func1(ptr sret(i32) %p, i32 %x, i32 %y)
+; CHECK: tail call void @sret_func1.Tm(ptr sret(i32) %p, i32 %x, i32 %y, ptr @g1)
+; CHECK: ret void
+define void @sret_func1(i32* sret(i32) %p, i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %l = load i32, i32* @g1, align 4
+  %sum2 = add i32 %sum, %l
+  store i32 %sum2, i32* %p
+  ret void
+}
+
+; CHECK-LABEL: define void @sret_func2(ptr sret(i32) %p, i32 %x, i32 %y)
+; CHECK: tail call void @sret_func1.Tm(ptr sret(i32) %p, i32 %x, i32 %y, ptr @g2)
+; CHECK: ret void
+define void @sret_func2(i32* sret(i32) %p, i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %l = load i32, i32* @g2, align 4
+  %sum2 = add i32 %sum, %l
+  store i32 %sum2, i32* %p
+  ret void
+}
+
+; CHECK-LABEL: define internal void @sret_func1.Tm(ptr sret(i32) %0, i32 %1, i32 %2, ptr %3)
+; CHECK: %l = load i32, ptr %3, align 4
+; CHECK: store i32 %sum2, ptr %0
+; CHECK: ret
+
+
+; Don't merge all functions, because we would generate too many parameters.
+; Instead merge those functions which match best.
+
+; CHECK-LABEL: define i32 @func1_merged_with3(i32 %x)
+; CHECK: %1 = tail call i32 @func1_merged_with3.Tm(i32 %x, ptr @g1)
+; CHECK: ret i32 %1
+define i32 @func1_merged_with3(i32 %x) {
+  %l1 = load i32, i32* @g1, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g2, align 4
+  %sum2 = add i32 %sum, %l2
+  %l3 = load i32, i32* @g3, align 4
+  %sum3 = add i32 %sum2, %l2
+  %l4 = load i32, i32* @g4, align 4
+  %sum4 = add i32 %sum3, %l2
+  %l5 = load i32, i32* @g5, align 4
+  %sum5 = add i32 %sum4, %l2
+  ret i32 %sum5
+}
+
+; CHECK-LABEL: define i32 @func2_merged_with4(i32 %x)
+; CHECK: %1 = tail call i32 @func2_merged_with4.Tm(i32 %x, ptr @g2)
+; CHECK: ret i32 %1
+define i32 @func2_merged_with4(i32 %x) {
+  %l1 = load i32, i32* @g2, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g3, align 4
+  %sum2 = add i32 %sum, %l2
+  %l3 = load i32, i32* @g4, align 4
+  %sum3 = add i32 %sum2, %l2
+  %l4 = load i32, i32* @g5, align 4
+  %sum4 = add i32 %sum3, %l2
+  %l5 = load i32, i32* @g1, align 4
+  %sum5 = add i32 %sum4, %l2
+  ret i32 %sum5
+}
+
+; CHECK-LABEL: define i32 @func3_merged_with1(i32 %x)
+; CHECK: %1 = tail call i32 @func1_merged_with3.Tm(i32 %x, ptr @g2)
+; CHECK: ret i32 %1
+define i32 @func3_merged_with1(i32 %x) {
+  %l1 = load i32, i32* @g2, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g2, align 4
+  %sum2 = add i32 %sum, %l2
+  %l3 = load i32, i32* @g3, align 4
+  %sum3 = add i32 %sum2, %l2
+  %l4 = load i32, i32* @g4, align 4
+  %sum4 = add i32 %sum3, %l2
+  %l5 = load i32, i32* @g5, align 4
+  %sum5 = add i32 %sum4, %l2
+  ret i32 %sum5
+}
+
+; CHECK-LABEL: define internal i32 @func1_merged_with3.Tm(i32 %0, ptr %1)
+; CHECK: load i32, ptr %1, align 4
+; CHECK: load i32, ptr @g2, align 4
+; CHECK: load i32, ptr @g3, align 4
+; CHECK: load i32, ptr @g4, align 4
+; CHECK: load i32, ptr @g5, align 4
+; CHECK: ret i32
+
+; CHECK-LABEL: define i32 @func4_merged_with2(i32 %x) {
+; CHECK: %1 = tail call i32 @func2_merged_with4.Tm(i32 %x, ptr @g1)
+; CHECK: ret i32 %1
+define i32 @func4_merged_with2(i32 %x) {
+  %l1 = load i32, i32* @g1, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g3, align 4
+  %sum2 = add i32 %sum, %l2
+  %l3 = load i32, i32* @g4, align 4
+  %sum3 = add i32 %sum2, %l2
+  %l4 = load i32, i32* @g5, align 4
+  %sum4 = add i32 %sum3, %l2
+  %l5 = load i32, i32* @g1, align 4
+  %sum5 = add i32 %sum4, %l2
+  ret i32 %sum5
+}
+
+
+; The same example as above, but we cannot merge func2 with func4, because
+; func4 calls func1 (which is merged with func3 in the first iteration).
+
+declare i32 @get_int(i32 %x)
+
+; CHECK-LABEL: define i32 @Function1_merged_with_3(i32 %x)
+; CHECK: %1 = tail call i32 @Function1_merged_with_3.Tm(i32 %x, ptr @g1)
+; CHECK: ret i32 %1
+define i32 @Function1_merged_with_3(i32 %x) {
+  %l1 = load i32, i32* @g1, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g2, align 4
+  %sum2 = add i32 %sum, %l2
+  %l3 = load i32, i32* @g3, align 4
+  %sum3 = add i32 %sum2, %l2
+  %l4 = load i32, i32* @g4, align 4
+  %sum4 = add i32 %sum3, %l2
+  %l5 = load i32, i32* @g5, align 4
+  %sum5 = add i32 %sum4, %l2
+  %c = call fastcc i32 @get_int(i32 %sum5)
+  ret i32 %c
+}
+
+; CHECK-LABEL: define i32 @Function2_not_merged(i32 %x)
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: %c = call fastcc i32 @get_int
+; CHECK: ret i32 %c
+define i32 @Function2_not_merged(i32 %x) {
+  %l1 = load i32, i32* @g2, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g3, align 4
+  %sum2 = add i32 %sum, %l2
+  %l3 = load i32, i32* @g4, align 4
+  %sum3 = add i32 %sum2, %l2
+  %l4 = load i32, i32* @g5, align 4
+  %sum4 = add i32 %sum3, %l2
+  %l5 = load i32, i32* @g1, align 4
+  %sum5 = add i32 %sum4, %l2
+  %c = call fastcc i32 @get_int(i32 %sum5)
+  ret i32 %c
+}
+
+; CHECK-LABEL: define i32 @Function3_merged_with_1(i32 %x)
+; CHECK: %1 = tail call i32 @Function1_merged_with_3.Tm(i32 %x, ptr @g2)
+; CHECK: ret i32 %1
+define i32 @Function3_merged_with_1(i32 %x) {
+  %l1 = load i32, i32* @g2, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g2, align 4
+  %sum2 = add i32 %sum, %l2
+  %l3 = load i32, i32* @g3, align 4
+  %sum3 = add i32 %sum2, %l2
+  %l4 = load i32, i32* @g4, align 4
+  %sum4 = add i32 %sum3, %l2
+  %l5 = load i32, i32* @g5, align 4
+  %sum5 = add i32 %sum4, %l2
+  %c = call fastcc i32 @get_int(i32 %sum5)
+  ret i32 %c
+}
+
+; CHECK-LABEL: define internal i32 @Function1_merged_with_3.Tm(i32 %0, ptr %1)
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: %c = call fastcc i32 @get_int
+; CHECK: ret i32 %c
+
+; CHECK-LABEL: define i32 @Function4_not_merged(i32 %x) {
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: %1 = call fastcc i32 @Function1_merged_with_3.Tm(i32 %sum5, ptr @g1)
+; CHECK: ret i32 %1
+define i32 @Function4_not_merged(i32 %x) {
+  %l1 = load i32, i32* @g1, align 4
+  %sum = add i32 %x, %l1
+  %l2 = load i32, i32* @g3, align 4
+  %sum2 = add i32 %sum, %l2
+  %l3 = load i32, i32* @g4, align 4
+  %sum3 = add i32 %sum2, %l2
+  %l4 = load i32, i32* @g5, align 4
+  %sum4 = add i32 %sum3, %l2
+  %l5 = load i32, i32* @g1, align 4
+  %sum5 = add i32 %sum4, %l2
+  %c = call fastcc i32 @Function1_merged_with_3(i32 %sum5)
+  ret i32 %c
+}
+
+
+; Test a call chain: caller -> callee1 -> callee2.
+; Functions should be merged in bottom-up order: callee2, callee1, caller.
+; Also check that the calling convention is preserved.
+
+; CHECK-LABEL: define fastcc i32 @callee1_a(i32 %x, i32 %y)
+; CHECK: %1 = tail call fastcc i32 @callee1_a.Tm(i32 %x, i32 %y, ptr @g1)
+; CHECK: ret i32 %1
+define fastcc i32 @callee1_a(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %y
+  %c = call i32 @callee2_a(i32 %sum2, i32 %y)
+  %sum3 = add i32 %sum2, %c
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define fastcc i32 @callee1_b(i32 %x, i32 %y)
+; CHECK: %1 = tail call fastcc i32 @callee1_a.Tm(i32 %x, i32 %y, ptr @g2)
+; CHECK: ret i32 %1
+define fastcc i32 @callee1_b(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %y
+  %c = call i32 @callee2_b(i32 %sum2, i32 %y)
+  %sum3 = add i32 %sum2, %c
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define internal fastcc i32 @callee1_a.Tm(i32 %0, i32 %1, ptr %2)
+; CHECK: call i32 @callee2_a.Tm(i32 %sum2, i32 %1, ptr %2)
+; CHECK: ret
+
+; CHECK-NOT: @callee2_a(
+define internal i32 @callee2_a(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = sub i32 %sum, %y
+  %l = load i32, i32* @g1, align 4
+  %sum3 = add i32 %sum2, %y
+  ret i32 %sum3
+}
+
+; CHECK-NOT: @callee2_b(
+define internal i32 @callee2_b(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = sub i32 %sum, %y
+  %l = load i32, i32* @g2, align 4
+  %sum3 = add i32 %sum2, %y
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define i32 @caller_a(i32 %x, i32 %y)
+; CHECK: %1 = tail call i32 @caller_a.Tm(i32 %x, i32 %y, ptr @g1)
+; CHECK: ret i32 %1
+define i32 @caller_a(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %y
+  %c = call fastcc i32 @callee1_a(i32 %sum2, i32 %y)
+  %sum3 = add i32 %sum2, %c
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define i32 @caller_b(i32 %x, i32 %y)
+; CHECK: %1 = tail call i32 @caller_a.Tm(i32 %x, i32 %y, ptr @g2)
+; CHECK: ret i32 %1
+define i32 @caller_b(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %y
+  %c = call fastcc i32 @callee1_b(i32 %sum2, i32 %y)
+  %sum3 = add i32 %sum2, %c
+  ret i32 %sum3
+}
+
+; CHECK-LABEL: define internal i32 @caller_a.Tm(i32 %0, i32 %1, ptr %2)
+; CHECK: call fastcc i32 @callee1_a.Tm(i32 %sum2, i32 %1, ptr %2)
+; CHECK: ret
+
+
+; Ensure that we do not merge functions that are identical with the
+; exception of the order of the incoming blocks to a phi.
+
+; CHECK-LABEL: define linkonce_odr hidden i1 @first(i2 %0)
+define linkonce_odr hidden i1 @first(i2) {
+entry:
+; CHECK: switch i2
+  switch i2 %0, label %default [
+    i2 0, label %L1
+    i2 1, label %L2
+    i2 -2, label %L3
+  ]
+default:
+  unreachable
+L1:
+  br label %done
+L2:
+  br label %done
+L3:
+  br label %done
+done:
+  %result = phi i1 [ true, %L1 ], [ false, %L2 ], [ false, %L3 ]
+; CHECK: ret i1
+  ret i1 %result
+}
+
+; CHECK-LABEL: define linkonce_odr hidden i1 @second(i2 %0)
+define linkonce_odr hidden i1 @second(i2) {
+entry:
+; CHECK: switch i2
+  switch i2 %0, label %default [
+    i2 0, label %L1
+    i2 1, label %L2
+    i2 -2, label %L3
+  ]
+default:
+  unreachable
+L1:
+  br label %done
+L2:
+  br label %done
+L3:
+  br label %done
+done:
+  %result = phi i1 [ true, %L3 ], [ false, %L2 ], [ false, %L1 ]
+; CHECK: ret i1
+  ret i1 %result
+}
+
+; Check self recursive functions
+
+; CHECK-LABEL: define internal void @recursive1(i32 %x, i32 %y)
+; CHECK: tail call void @recursive1.Tm(i32 %x, i32 %y, ptr @g1, ptr @recursive1)
+; CHECK: ret void
+define internal void @recursive1(i32 %x, i32 %y) {
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  %l = load i32, i32* @g1, align 4
+  call void @recursive1(i32 %x, i32 %y)
+  br label %bb2
+
+bb2:
+  ret void
+}
+
+; CHECK-LABEL: define internal void @recursive2(i32 %x, i32 %y)
+; CHECK: tail call void @recursive1.Tm(i32 %x, i32 %y, ptr @g2, ptr @recursive2)
+; CHECK: ret void
+define internal void @recursive2(i32 %x, i32 %y) {
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  %l = load i32, i32* @g2, align 4
+  call void @recursive2(i32 %x, i32 %y)
+  br label %bb2
+
+bb2:
+  ret void
+}
+; CHECK-LABEL: define internal void @recursive1.Tm(i32 %0, i32 %1, ptr %2, ptr %3)
+; CHECK: load i32, ptr %2
+; CHECK: call void %3(i32 %0, i32 %1)
+; CHECK: ret void
+
+
+; CHECK-LABEL: define internal void @another_recursive_func(i32 %x)
+; CHECK: tail call void @another_recursive_func.Tm(i32 %x, ptr @g1, ptr @another_recursive_func)
+; CHECK: ret void
+define internal void @another_recursive_func(i32 %x) {
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  store i32 %x, i32 *@g1, align 4
+  call void @another_recursive_func(i32 %x)
+  br label %bb2
+
+bb2:
+  ret void
+}
+; CHECK-NOT: @not_really_recursive(
+
+; CHECK-LABEL: define internal void @another_recursive_func.Tm(i32 %0, ptr %1, ptr %2)
+; CHECK: store i32 %0, ptr %1
+; CHECK: call void %2(i32 %0)
+; CHECK: ret void
+define internal void @not_really_recursive(i32 %x) {
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  store i32 %x, i32 *@g2, align 4
+  call void @callee1(i32 %x)
+  br label %bb2
+
+bb2:
+  ret void
+}
+; CHECK-NOT: @not_really_recursive(
+
+; CHECK-LABEL: define void @call_recursive_funcs(i32 %x)
+; CHECK: call void @recursive1(i32 %x, i32 %x)
+; CHECK: call void @recursive2(i32 %x, i32 %x)
+; CHECK: call void @another_recursive_func(i32 %x)
+; CHECK: call void @another_recursive_func.Tm(i32 %x, ptr @g2, ptr @callee1)
+; CHECK: ret void
+define void @call_recursive_funcs(i32 %x) {
+  call void @recursive1(i32 %x, i32 %x)
+  call void @recursive2(i32 %x, i32 %x)
+  call void @another_recursive_func(i32 %x)
+  call void @not_really_recursive(i32 %x)
+  ret void
+}
+
+; Ensure that we do not merge functions which make use of distinct dtrace
+; probes. Each call to a dtrace probe must resolve to a unique patchpoint.
+
+declare void @"__dtrace_probe$Apple$Probe1$v1$696e74"(i32) local_unnamed_addr
+
+; CHECK-LABEL: define i32 @use_dtrace_probe1
+; CHECK: call void @"__dtrace_probe$Apple$Probe1$v1$696e74"
+define i32 @use_dtrace_probe1(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %y
+  %l = load i32, i32* @g1, align 4
+  %sum3 = add i32 %sum2, %y
+  tail call void @"__dtrace_probe$Apple$Probe1$v1$696e74"(i32 undef)
+  ret i32 %sum3
+}
+
+declare void @"__dtrace_probe$Apple$Probe2$v1$696e74"(i32) local_unnamed_addr
+
+; CHECK-LABEL: define i32 @use_dtrace_probe2
+; CHECK: call void @"__dtrace_probe$Apple$Probe2$v1$696e74"
+define i32 @use_dtrace_probe2(i32 %x, i32 %y) {
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %y
+  %l = load i32, i32* @g2, align 4
+  %sum3 = add i32 %sum2, %y
+  tail call void @"__dtrace_probe$Apple$Probe2$v1$696e74"(i32 undef)
+  ret i32 %sum3
+}
diff --git a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll
new file mode 100644
index 000000000000000..c5c8b898c046e51
--- /dev/null
+++ b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll
@@ -0,0 +1,190 @@
+; RUN: opt -S -enable-aggressive-mergefunc-ignoringconst -passes=mergefunc-ignoring-const %s -o - | FileCheck %s
+
+%4 = type opaque
+%10 = type opaque
+%"struct.SearchSpec::State" = type { %4* }
+%"struct.PointerList" = type { i8*, i8*, i8*, i8*, i8* }
+%"struct.DynamicCallback" = type { %10* }
+
+; CHECK: define ptr @invoke_foo(ptr nocapture readonly %.block_descriptor, ptr %stateWrapper)
+; CHECK: %1 = {{.*}}call ptr @invoke_foo.Tm
+; CHECK: define ptr @invoke_bar(ptr nocapture readonly %.block_descriptor, ptr %stateWrapper) {
+; CHECK: %1 = {{.*}}call ptr @invoke_foo.Tm
+; CHECK: define {{.*}}.Tm(ptr nocapture readonly %0, ptr %1, ptr %2, ptr %3)
+
+; Function Attrs: minsize optsize ssp uwtable
+define i8* @invoke_foo(i8* nocapture readonly %.block_descriptor, i8* %stateWrapper) #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %state = alloca %"struct.SearchSpec::State", align 8
+  %agg.tmp = alloca %"struct.PointerList", align 8
+  %0 = tail call i8* @llvm.objc.retain(i8* %stateWrapper) #2
+  %1 = bitcast %"struct.SearchSpec::State"* %state to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #2
+  %2 = getelementptr inbounds i8, i8* %stateWrapper, i64 16
+  %3 = bitcast i8* %2 to %"struct.SearchSpec::State"* (i8*)**
+  %4 = load %"struct.SearchSpec::State"* (i8*)*, %"struct.SearchSpec::State"* (i8*)** %3, align 8
+  %call.i4 = invoke nonnull align 8 dereferenceable(8) %"struct.SearchSpec::State"* %4(i8* nonnull %stateWrapper) #31
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %initialText.i.i = getelementptr inbounds %"struct.SearchSpec::State", %"struct.SearchSpec::State"* %state, i64 0, i32 0
+  %initialText2.i.i = getelementptr inbounds %"struct.SearchSpec::State", %"struct.SearchSpec::State"* %call.i4, i64 0, i32 0
+  %5 = load %4*, %4** %initialText2.i.i, align 8
+  %6 = bitcast %4* %5 to i8*
+  %7 = tail call i8* @llvm.objc.retain(i8* %6) #2
+  store %4* %5, %4** %initialText.i.i, align 8
+  %block.capture.addr = getelementptr inbounds i8, i8* %.block_descriptor, i64 32
+  %8 = bitcast i8* %block.capture.addr to i8**
+  %9 = load i8*, i8** %8, align 8
+  invoke void @callee2(%"struct.PointerList"* nonnull sret(%"struct.PointerList") align 8 %agg.tmp, i8* %9, i1 zeroext false) #31
+          to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:                                     ; preds = %invoke.cont
+  %block.capture.addr3 = getelementptr inbounds i8, i8* %.block_descriptor, i64 40
+  %10 = bitcast i8* %block.capture.addr3 to %4**
+  %agg.tmp6.sroa.3.0..sroa_idx12 = getelementptr inbounds %"struct.PointerList", %"struct.PointerList"* %agg.tmp, i64 0, i32 3
+  %agg.tmp6.sroa.3.0.copyload = load i8*, i8** %agg.tmp6.sroa.3.0..sroa_idx12, align 8
+  %11 = load %4*, %4** %10, align 8
+  invoke void @callee1(%"struct.SearchSpec::State"* nonnull align 8 dereferenceable(8) %state, %4* %11) #31
+          to label %invoke.cont4 unwind label %lpad.i
+
+lpad.i:                                           ; preds = %invoke.cont2
+  %12 = landingpad { i8*, i32 }
+          cleanup
+  call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2
+  %.phi.trans.insert = bitcast %"struct.SearchSpec::State"* %state to i8**
+  %.pre = load i8*, i8** %.phi.trans.insert, align 8
+  br label %lpad1.body
+
+invoke.cont4:                                     ; preds = %invoke.cont2
+  call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2
+  %13 = load %4*, %4** %initialText.i.i, align 8
+  store %4* null, %4** %initialText.i.i, align 8
+  %call78 = call fastcc i8* @callee3(%4* %13) #31 [ "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
+  call void (...) @llvm.objc.clang.arc.noop.use(i8* %call78) #2
+  %14 = bitcast %"struct.SearchSpec::State"* %state to i8**
+  %15 = load i8*, i8** %14, align 8
+  call void @llvm.objc.release(i8* %15) #2
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2
+  call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1
+  %16 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %call78) #2
+  ret i8* %call78
+
+lpad:                                             ; preds = %entry
+  %17 = landingpad { i8*, i32 }
+          cleanup
+  br label %ehcleanup
+
+lpad1:                                            ; preds = %invoke.cont
+  %18 = landingpad { i8*, i32 }
+          cleanup
+  br label %lpad1.body
+
+lpad1.body:                                       ; preds = %lpad1, %lpad.i
+  %19 = phi i8* [ %6, %lpad1 ], [ %.pre, %lpad.i ]
+  %eh.lpad-body = phi { i8*, i32 } [ %18, %lpad1 ], [ %12, %lpad.i ]
+  call void @llvm.objc.release(i8* %19) #2
+  br label %ehcleanup
+
+ehcleanup:                                        ; preds = %lpad1.body, %lpad
+  %.pn = phi { i8*, i32 } [ %eh.lpad-body, %lpad1.body ], [ %17, %lpad ]
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2
+  call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1
+  resume { i8*, i32 } %.pn
+}
+
+; Function Attrs: minsize optsize ssp uwtable
+define i8* @invoke_bar(i8* nocapture readonly %.block_descriptor, i8* %stateWrapper) #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %state = alloca %"struct.DynamicCallback", align 8
+  %agg.tmp = alloca %"struct.PointerList", align 8
+  %0 = tail call i8* @llvm.objc.retain(i8* %stateWrapper) #2
+  %1 = bitcast %"struct.DynamicCallback"* %state to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #2
+  %2 = getelementptr inbounds i8, i8* %stateWrapper, i64 16
+  %3 = bitcast i8* %2 to %"struct.DynamicCallback"* (i8*)**
+  %4 = load %"struct.DynamicCallback"* (i8*)*, %"struct.DynamicCallback"* (i8*)** %3, align 8
+  %call.i4 = invoke nonnull align 8 dereferenceable(8) %"struct.DynamicCallback"* %4(i8* nonnull %stateWrapper) #31
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %call.i.i = getelementptr inbounds %"struct.DynamicCallback", %"struct.DynamicCallback"* %state, i64 0, i32 0
+  %call2.i.i = getelementptr inbounds %"struct.DynamicCallback", %"struct.DynamicCallback"* %call.i4, i64 0, i32 0
+  %5 = load %10*, %10** %call2.i.i, align 8
+  %6 = bitcast %10* %5 to i8*
+  %7 = tail call i8* @llvm.objc.retain(i8* %6) #2
+  store %10* %5, %10** %call.i.i, align 8
+  %block.capture.addr = getelementptr inbounds i8, i8* %.block_descriptor, i64 32
+  %8 = bitcast i8* %block.capture.addr to i8**
+  %9 = load i8*, i8** %8, align 8
+  invoke void @callee2(%"struct.PointerList"* nonnull sret(%"struct.PointerList") align 8 %agg.tmp, i8* %9, i1 zeroext false) #31
+          to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:                                     ; preds = %invoke.cont
+  %block.capture.addr3 = getelementptr inbounds i8, i8* %.block_descriptor, i64 40
+  %10 = bitcast i8* %block.capture.addr3 to %10**
+  %agg.tmp6.sroa.3.0..sroa_idx12 = getelementptr inbounds %"struct.PointerList", %"struct.PointerList"* %agg.tmp, i64 0, i32 3
+  %agg.tmp6.sroa.3.0.copyload = load i8*, i8** %agg.tmp6.sroa.3.0..sroa_idx12, align 8
+  %11 = load %10*, %10** %10, align 8
+  invoke void @callee5(%"struct.DynamicCallback"* nonnull align 8 dereferenceable(8) %state, %10* %11) #31
+          to label %invoke.cont4 unwind label %lpad.i
+
+lpad.i:                                           ; preds = %invoke.cont2
+  %12 = landingpad { i8*, i32 }
+          cleanup
+  call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2
+  %.phi.trans.insert = bitcast %"struct.DynamicCallback"* %state to i8**
+  %.pre = load i8*, i8** %.phi.trans.insert, align 8
+  br label %lpad1.body
+
+invoke.cont4:                                     ; preds = %invoke.cont2
+  call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2
+  %13 = load %10*, %10** %call.i.i, align 8
+  store %10* null, %10** %call.i.i, align 8
+  %call78 = call fastcc i8* @callee4(%10* %13) #31 [ "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
+  call void (...) @llvm.objc.clang.arc.noop.use(i8* %call78) #2
+  %14 = bitcast %"struct.DynamicCallback"* %state to i8**
+  %15 = load i8*, i8** %14, align 8
+  call void @llvm.objc.release(i8* %15) #2
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2
+  call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1
+  %16 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %call78) #2
+  ret i8* %call78
+
+lpad:                                             ; preds = %entry
+  %17 = landingpad { i8*, i32 }
+          cleanup
+  br label %ehcleanup
+
+lpad1:                                            ; preds = %invoke.cont
+  %18 = landingpad { i8*, i32 }
+          cleanup
+  br label %lpad1.body
+
+lpad1.body:                                       ; preds = %lpad1, %lpad.i
+  %19 = phi i8* [ %6, %lpad1 ], [ %.pre, %lpad.i ]
+  %eh.lpad-body = phi { i8*, i32 } [ %18, %lpad1 ], [ %12, %lpad.i ]
+  call void @llvm.objc.release(i8* %19) #2
+  br label %ehcleanup
+
+ehcleanup:                                        ; preds = %lpad1.body, %lpad
+  %.pn = phi { i8*, i32 } [ %eh.lpad-body, %lpad1.body ], [ %17, %lpad ]
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2
+  call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1
+  resume { i8*, i32 } %.pn
+}
+declare void @callee1(%"struct.SearchSpec::State"* nonnull align 8 dereferenceable(8), %4*)
+declare void @callee2(%"struct.PointerList"* sret(%"struct.PointerList") align 8, i8*, i1 zeroext)
+declare i8* @callee3(%4* %state.coerce)
+declare i8* @callee4(%10* %state.coerce)
+declare void @callee5(%"struct.DynamicCallback"* nonnull align 8 dereferenceable(8), %10*)
+declare i32 @__gxx_personality_v0(...)
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare void @llvm.objc.clang.arc.noop.use(...)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+
+!1 = !{}

>From 70b35ec0a81375c49482755f08afc9463210ca87 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 3 Nov 2023 11:15:08 -0700
Subject: [PATCH 62/76] [SelectionDAG] Add initial support for nneg flag on
 ISD::ZERO_EXTEND. (#70872)

This adds the nneg flag to SDNodeFlags and the node printing code.
SelectionDAGBuilder will add this flag to the node if the target doesn't
prefer sign extend.

A future RISC-V patch can remove the sign extend preference from
SelectionDAGBuilder.

I've also added the flag to the DAG combine that converts
ISD::SIGN_EXTEND to ISD::ZERO_EXTEND.
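
For reference, the IR-level flag this mirrors looks like the following
(a minimal sketch; the function is hypothetical):

  define i64 @ext(i32 %x) {
    ; nneg asserts the operand is non-negative; if it is not, the result
    ; is poison, which is what canCreateUndefOrPoison models below.
    %e = zext nneg i32 %x to i64
    ret i64 %e
  }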
---
 llvm/include/llvm/CodeGen/ISDOpcodes.h           |  5 ++++-
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h    | 10 +++++++---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp    |  7 +++++--
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp   |  5 ++++-
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 16 +++++++++-------
 .../CodeGen/SelectionDAG/SelectionDAGDumper.cpp  |  3 +++
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp |  8 +++++++-
 7 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 67779a23a191313..349d1286c8dc4f4 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -773,7 +773,10 @@ enum NodeType {
   /// into new bits.
   SIGN_EXTEND,
 
-  /// ZERO_EXTEND - Used for integer types, zeroing the new bits.
+  /// ZERO_EXTEND - Used for integer types, zeroing the new bits. Can carry
+  /// the NonNeg SDNodeFlag to indicate that the input is known to be
+  /// non-negative. If the flag is present and the input is negative, the result
+  /// is poison.
   ZERO_EXTEND,
 
   /// ANY_EXTEND - Used for integer types.  The high bits are undefined.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 59c6feec8bcbfed..4df56aac4aa17ba 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -381,6 +381,7 @@ struct SDNodeFlags {
   bool NoUnsignedWrap : 1;
   bool NoSignedWrap : 1;
   bool Exact : 1;
+  bool NonNeg : 1;
   bool NoNaNs : 1;
   bool NoInfs : 1;
   bool NoSignedZeros : 1;
@@ -401,9 +402,9 @@ struct SDNodeFlags {
 public:
   /// Default constructor turns off all optimization flags.
   SDNodeFlags()
-      : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NoNaNs(false),
-        NoInfs(false), NoSignedZeros(false), AllowReciprocal(false),
-        AllowContract(false), ApproximateFuncs(false),
+      : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NonNeg(false),
+        NoNaNs(false), NoInfs(false), NoSignedZeros(false),
+        AllowReciprocal(false), AllowContract(false), ApproximateFuncs(false),
         AllowReassociation(false), NoFPExcept(false), Unpredictable(false) {}
 
   /// Propagate the fast-math-flags from an IR FPMathOperator.
@@ -421,6 +422,7 @@ struct SDNodeFlags {
   void setNoUnsignedWrap(bool b) { NoUnsignedWrap = b; }
   void setNoSignedWrap(bool b) { NoSignedWrap = b; }
   void setExact(bool b) { Exact = b; }
+  void setNonNeg(bool b) { NonNeg = b; }
   void setNoNaNs(bool b) { NoNaNs = b; }
   void setNoInfs(bool b) { NoInfs = b; }
   void setNoSignedZeros(bool b) { NoSignedZeros = b; }
@@ -435,6 +437,7 @@ struct SDNodeFlags {
   bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
   bool hasNoSignedWrap() const { return NoSignedWrap; }
   bool hasExact() const { return Exact; }
+  bool hasNonNeg() const { return NonNeg; }
   bool hasNoNaNs() const { return NoNaNs; }
   bool hasNoInfs() const { return NoInfs; }
   bool hasNoSignedZeros() const { return NoSignedZeros; }
@@ -451,6 +454,7 @@ struct SDNodeFlags {
     NoUnsignedWrap &= Flags.NoUnsignedWrap;
     NoSignedWrap &= Flags.NoSignedWrap;
     Exact &= Flags.Exact;
+    NonNeg &= Flags.NonNeg;
     NoNaNs &= Flags.NoNaNs;
     NoInfs &= Flags.NoInfs;
     NoSignedZeros &= Flags.NoSignedZeros;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ca5bd4952866886..8c1282274372088 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13484,8 +13484,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   // fold (sext x) -> (zext x) if the sign bit is known zero.
   if (!TLI.isSExtCheaperThanZExt(N0.getValueType(), VT) &&
       (!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
-      DAG.SignBitIsZero(N0))
-    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
+      DAG.SignBitIsZero(N0)) {
+    SDNodeFlags Flags;
+    Flags.setNonNeg(true);
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0, Flags);
+  }
 
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4ef5fd87d1340c2..1bcd85417eba260 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5027,7 +5027,6 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::BITREVERSE:
   case ISD::PARITY:
   case ISD::SIGN_EXTEND:
-  case ISD::ZERO_EXTEND:
   case ISD::TRUNCATE:
   case ISD::SIGN_EXTEND_INREG:
   case ISD::SIGN_EXTEND_VECTOR_INREG:
@@ -5037,6 +5036,10 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::BUILD_PAIR:
     return false;
 
+  // Matches hasPoisonGeneratingFlags().
+  case ISD::ZERO_EXTEND:
+    return ConsiderFlags && Op->getFlags().hasNonNeg();
+
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 29505f7505ba25c..48096dc1687fcec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3527,18 +3527,20 @@ void SelectionDAGBuilder::visitZExt(const User &I) {
   auto &TLI = DAG.getTargetLoweringInfo();
   EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
 
-  // Since we don't yet have a representation of zext nneg in SDAG or MI,
-  // eagerly use the information to canonicalize towards sign_extend if
-  // that is the target's preference.  TODO: Add nneg support to the
-  // SDAG and MI representations.
-  if (auto *PNI = dyn_cast<PossiblyNonNegInst>(&I);
-      PNI && PNI->hasNonNeg() &&
+  SDNodeFlags Flags;
+  if (auto *PNI = dyn_cast<PossiblyNonNegInst>(&I))
+    Flags.setNonNeg(PNI->hasNonNeg());
+
+  // Eagerly use nonneg information to canonicalize towards sign_extend if
+  // that is the target's preference.
+  // TODO: Let the target do this later.
+  if (Flags.hasNonNeg() &&
       TLI.isSExtCheaperThanZExt(N.getValueType(), DestVT)) {
     setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
     return;
   }
 
-  setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
+  setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N, Flags));
 }
 
 void SelectionDAGBuilder::visitSExt(const User &I) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index a92111ca23656eb..78cc60084068a5f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -597,6 +597,9 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
   if (getFlags().hasExact())
     OS << " exact";
 
+  if (getFlags().hasNonNeg())
+    OS << " nneg";
+
   if (getFlags().hasNoNaNs())
     OS << " nnan";
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8aab7b90db5ad16..80f595ac4d4d9c0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2402,11 +2402,17 @@ bool TargetLowering::SimplifyDemandedBits(
         return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
     }
 
+    SDNodeFlags Flags = Op->getFlags();
     APInt InDemandedBits = DemandedBits.trunc(InBits);
     APInt InDemandedElts = DemandedElts.zext(InElts);
     if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
-                             Depth + 1))
+                             Depth + 1)) {
+      if (Flags.hasNonNeg()) {
+        Flags.setNonNeg(false);
+        Op->setFlags(Flags);
+      }
       return true;
+    }
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
     assert(Known.getBitWidth() == InBits && "Src width has changed?");
     Known = Known.zext(BitWidth);
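
For reference, the new flag is set and queried like the other wrap flags.
A minimal sketch of a combine that proves non-negativity and rebuilds the
node (illustrative only; the variable names are assumptions, not code from
this patch):

  // N0 is the operand of a ZERO_EXTEND we are about to create.
  SDNodeFlags Flags;
  Flags.setNonNeg(DAG.SignBitIsZero(N0)); // nneg: sign bit known zero
  SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0, Flags);
  if (ZExt->getFlags().hasNonNeg())
    ; // the extension is also a valid sign_extend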

>From 5adf6ab7ff9c0bcf32c58224aee6a6a3e901abee Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 3 Nov 2023 11:16:40 -0700
Subject: [PATCH 63/76] Revert "[IndVars] Generate zext nneg when locally
 obvious"

This reverts commit a6c8e27b3a052913a15a13ee0d4ac466c5ab3f92.  It appears likely to have caused https://lab.llvm.org/buildbot/#/builders/57/builds/30988.
---
 llvm/lib/Transforms/Utils/SimplifyIndVar.cpp  | 19 -------------------
 llvm/test/Analysis/ScalarEvolution/guards.ll  |  2 +-
 .../Transforms/IndVarSimplify/X86/pr59615.ll  |  2 +-
 .../IndVarSimplify/post-inc-range.ll          |  2 +-
 llvm/test/Transforms/LoopFlatten/widen-iv2.ll |  2 +-
 llvm/test/Transforms/LoopFlatten/widen-iv3.ll |  2 +-
 6 files changed, 5 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index a618d72b406b397..f3256985e1185eb 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1201,15 +1201,6 @@ Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType,
        L = L->getParentLoop())
     Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
 
-  // If we know the operand is never negative, prefer zext nneg.
-  // For constant expressions, fall back to plain sext or zext.
-  if (SE->isKnownNonNegative(SE->getSCEV(NarrowOper))) {
-    auto *Res = Builder.CreateZExt(NarrowOper, WideType);
-    if (auto *I = dyn_cast<Instruction>(Res))
-      I->setNonNeg(true);
-    return Res;
-  }
-
   return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
                     Builder.CreateZExt(NarrowOper, WideType);
 }
@@ -1711,16 +1702,6 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) {
     auto ExtendedOp = [&](Value * V)->Value * {
       if (V == NarrowUse)
         return WideBO;
-
-      // If we know the operand is never negative, prefer zext nneg.
-      // For constant expressions, fall back to plain sext or zext.
-      if (SE->isKnownNonNegative(SE->getSCEV(V))) {
-        auto *Res = Builder.CreateZExt(V, WideBO->getType());
-        if (auto *I = dyn_cast<Instruction>(Res))
-          I->setNonNeg(true);
-        return Res;
-      }
-
       if (ExtKind == ExtendKind::Zero)
         return Builder.CreateZExt(V, WideBO->getType());
       else
diff --git a/llvm/test/Analysis/ScalarEvolution/guards.ll b/llvm/test/Analysis/ScalarEvolution/guards.ll
index 137630cd25e6873..ea17c5840067afb 100644
--- a/llvm/test/Analysis/ScalarEvolution/guards.ll
+++ b/llvm/test/Analysis/ScalarEvolution/guards.ll
@@ -57,7 +57,7 @@ define void @test_2(i32 %n, ptr %len_buf) {
 ; CHECK-SAME: (i32 [[N:%.*]], ptr [[LEN_BUF:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_BUF]], align 4, !range [[RNG1:![0-9]+]]
-; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[LEN]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LEN]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr59615.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr59615.ll
index 4fe7f7fd01a0660..17b7b9d40b07a53 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/pr59615.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/pr59615.ll
@@ -17,7 +17,7 @@ define void @test() {
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb8:
 ; CHECK-NEXT:    [[VAR9:%.*]] = load atomic i32, ptr addrspace(1) poison unordered, align 8, !range [[RNG0]], !invariant.load !1, !noundef !1
-; CHECK-NEXT:    [[TMP0:%.*]] = zext nneg i32 [[VAR9]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[VAR9]] to i64
 ; CHECK-NEXT:    [[VAR10:%.*]] = icmp ult i64 [[INDVARS_IV]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[VAR10]], label [[BB12]], label [[BB11:%.*]]
 ; CHECK:       bb11:
diff --git a/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll b/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll
index 1df0d62168af24e..5c22ba1044b60af 100644
--- a/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll
+++ b/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll
@@ -120,7 +120,7 @@ define void @test_range_metadata(ptr %array_length_ptr, ptr %base,
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH:%.*]] ]
 ; CHECK-NEXT:    [[ARRAY_LENGTH:%.*]] = load i32, ptr [[ARRAY_LENGTH_PTR:%.*]], align 4, !range [[RNG0:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = zext nneg i32 [[ARRAY_LENGTH]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[ARRAY_LENGTH]] to i64
 ; CHECK-NEXT:    [[WITHIN_LIMITS:%.*]] = icmp ult i64 [[INDVARS_IV]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[WITHIN_LIMITS]], label [[CONTINUE:%.*]], label [[FOR_END:%.*]]
 ; CHECK:       continue:
diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
index f4c8b90d4bc27b0..946b98420249e2f 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv2.ll
@@ -39,7 +39,7 @@ define dso_local i32 @fn1() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY3_US]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ]
 ; CHECK-NEXT:    [[J_014_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY3_US]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[INDVAR]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = zext nneg i32 [[J_014_US]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[J_014_US]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i64 [[TMP8]], [[TMP5]]
 ; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[J_014_US]], [[MUL_US]]
 ; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
index b3a9ac823fd2df7..df8ee6ff0750574 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll
@@ -18,7 +18,7 @@ define i16 @foo() {
 ; CHECK-NEXT:    [[SUM_012:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[ADD5_LCSSA:%.*]], [[FOR_COND_CLEANUP3]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul nsw i32 [[INDVAR2]], 16
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i16 [[I_013]], 16
-; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i16 [[MUL]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[MUL]] to i32
 ; CHECK-NEXT:    br label [[FOR_BODY4:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[ADD5_LCSSA_LCSSA:%.*]] = phi i16 [ [[ADD5_LCSSA]], [[FOR_COND_CLEANUP3]] ]

>From f7cd6194a2320429b1569172b868b21947c37efa Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 3 Nov 2023 18:03:22 +0000
Subject: [PATCH 64/76] [IR] IntrinsicInst.cpp - use
 StringRef::starts_with/ends_with instead of startswith/endswith. NFC.

startswith/endswith are wrappers around starts_with/ends_with and will eventually be removed (to more closely match std::string_view)

Also add missing assert message
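
The replacement is mechanical and one-to-one; a standalone sketch
(illustrative only, not code from this patch):

  #include "llvm/ADT/StringRef.h"
  #include <cassert>

  void example() {
    llvm::StringRef Name = "llvm.memcpy";
    bool Old = Name.startswith("llvm.");  // deprecated spelling
    bool New = Name.starts_with("llvm."); // preferred; mirrors std::string_view
    assert(Old == New && "the two spellings behave identically");
  }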
---
 llvm/lib/IR/IntrinsicInst.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 20ae08dd1283000..a24ca8d100527d5 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -235,7 +235,7 @@ void DbgAssignIntrinsic::setValue(Value *V) {
 
 int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable,
                                                StringRef Name) {
-  assert(Name.startswith("llvm."));
+  assert(Name.starts_with("llvm.") && "Unexpected intrinsic prefix");
 
   // Do successive binary searches of the dotted name components. For
   // "llvm.gc.experimental.statepoint.p1i8.p1i32", we will find the range of
@@ -265,7 +265,7 @@ int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable,
     return -1;
   StringRef NameFound = *LastLow;
   if (Name == NameFound ||
-      (Name.startswith(NameFound) && Name[NameFound.size()] == '.'))
+      (Name.starts_with(NameFound) && Name[NameFound.size()] == '.'))
     return LastLow - NameTable.begin();
   return -1;
 }

>From bcb685e11945946335c2dc6265779f0226491b49 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 3 Nov 2023 18:14:49 +0000
Subject: [PATCH 65/76] [Support] Use StringRef::starts_with/ends_with instead
 of startswith/endswith. NFC.

startswith/endswith are wrappers around starts_with/ends_with and will eventually be removed (to more closely match std::string_view)
---
 llvm/include/llvm/Support/TypeName.h        |  4 ++--
 llvm/include/llvm/Support/YAMLTraits.h      |  8 ++++----
 llvm/lib/Support/APFloat.cpp                |  2 +-
 llvm/lib/Support/CachePruning.cpp           |  2 +-
 llvm/lib/Support/DebugCounter.cpp           |  4 ++--
 llvm/lib/Support/ELFAttributes.cpp          |  2 +-
 llvm/lib/Support/Path.cpp                   | 10 +++++-----
 llvm/lib/Support/RISCVISAInfo.cpp           | 20 ++++++++++----------
 llvm/lib/Support/Signals.cpp                |  4 ++--
 llvm/lib/Support/SpecialCaseList.cpp        |  6 +++---
 llvm/lib/Support/StringRef.cpp              |  6 +++---
 llvm/lib/Support/UnicodeNameToCodepoint.cpp |  2 +-
 llvm/lib/Support/VirtualFileSystem.cpp      |  2 +-
 llvm/lib/Support/Windows/Path.inc           |  6 +++---
 llvm/lib/Support/Windows/Process.inc        |  2 +-
 llvm/lib/Support/YAMLParser.cpp             |  2 +-
 16 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/Support/TypeName.h b/llvm/include/llvm/Support/TypeName.h
index 95f20f7dfde7076..9547e76a7fa79bc 100644
--- a/llvm/include/llvm/Support/TypeName.h
+++ b/llvm/include/llvm/Support/TypeName.h
@@ -33,7 +33,7 @@ inline StringRef getTypeName() {
   assert(!Name.empty() && "Unable to find the template parameter!");
   Name = Name.drop_front(Key.size());
 
-  assert(Name.endswith("]") && "Name doesn't end in the substitution key!");
+  assert(Name.ends_with("]") && "Name doesn't end in the substitution key!");
   return Name.drop_back(1);
 #elif defined(_MSC_VER)
   StringRef Name = __FUNCSIG__;
@@ -44,7 +44,7 @@ inline StringRef getTypeName() {
   Name = Name.drop_front(Key.size());
 
   for (StringRef Prefix : {"class ", "struct ", "union ", "enum "})
-    if (Name.startswith(Prefix)) {
+    if (Name.starts_with(Prefix)) {
       Name = Name.drop_front(Prefix.size());
       break;
     }
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 45bb3034098449f..99074105a556989 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -584,11 +584,11 @@ inline bool isNumeric(StringRef S) {
   // Section 10.3.2 Tag Resolution
   // YAML 1.2 Specification prohibits Base 8 and Base 16 numbers prefixed with
   // [-+], so S should be used instead of Tail.
-  if (S.startswith("0o"))
+  if (S.starts_with("0o"))
     return S.size() > 2 &&
            S.drop_front(2).find_first_not_of("01234567") == StringRef::npos;
 
-  if (S.startswith("0x"))
+  if (S.starts_with("0x"))
     return S.size() > 2 && S.drop_front(2).find_first_not_of(
                                "0123456789abcdefABCDEF") == StringRef::npos;
 
@@ -598,12 +598,12 @@ inline bool isNumeric(StringRef S) {
   // Handle cases when the number starts with '.' and hence needs at least one
   // digit after dot (as opposed by number which has digits before the dot), but
   // doesn't have one.
-  if (S.startswith(".") &&
+  if (S.starts_with(".") &&
       (S.equals(".") ||
        (S.size() > 1 && std::strchr("0123456789", S[1]) == nullptr)))
     return false;
 
-  if (S.startswith("E") || S.startswith("e"))
+  if (S.starts_with("E") || S.starts_with("e"))
     return false;
 
   enum ParseState {
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 77ba7a0a0215379..0a4f5ac01553f10 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -3148,7 +3148,7 @@ bool IEEEFloat::convertFromStringSpecials(StringRef str) {
       return false;
   }
 
-  if (str.startswith("nan") || str.startswith("NaN")) {
+  if (str.starts_with("nan") || str.starts_with("NaN")) {
     str = str.drop_front(3);
 
     // A NaN without payload.
diff --git a/llvm/lib/Support/CachePruning.cpp b/llvm/lib/Support/CachePruning.cpp
index a56d8356d8383ca..4eae08b18c9b53d 100644
--- a/llvm/lib/Support/CachePruning.cpp
+++ b/llvm/lib/Support/CachePruning.cpp
@@ -218,7 +218,7 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy,
     // This acts as a safeguard against data loss if the user specifies the
     // wrong directory as their cache directory.
     StringRef filename = sys::path::filename(File->path());
-    if (!filename.startswith("llvmcache-") && !filename.startswith("Thin-"))
+    if (!filename.starts_with("llvmcache-") && !filename.starts_with("Thin-"))
       continue;
 
     // Look at this file. If we can't stat it, there's nothing interesting
diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp
index 26293bf92a42e9f..502665d2a8348ea 100644
--- a/llvm/lib/Support/DebugCounter.cpp
+++ b/llvm/lib/Support/DebugCounter.cpp
@@ -100,7 +100,7 @@ void DebugCounter::push_back(const std::string &Val) {
   }
   // Now we need to see if this is the skip or the count, remove the suffix, and
   // add it to the counter values.
-  if (CounterPair.first.endswith("-skip")) {
+  if (CounterPair.first.ends_with("-skip")) {
     auto CounterName = CounterPair.first.drop_back(5);
     unsigned CounterID = getCounterId(std::string(CounterName));
     if (!CounterID) {
@@ -113,7 +113,7 @@ void DebugCounter::push_back(const std::string &Val) {
     CounterInfo &Counter = Counters[CounterID];
     Counter.Skip = CounterVal;
     Counter.IsSet = true;
-  } else if (CounterPair.first.endswith("-count")) {
+  } else if (CounterPair.first.ends_with("-count")) {
     auto CounterName = CounterPair.first.drop_back(6);
     unsigned CounterID = getCounterId(std::string(CounterName));
     if (!CounterID) {
diff --git a/llvm/lib/Support/ELFAttributes.cpp b/llvm/lib/Support/ELFAttributes.cpp
index f2a1732fc3761f7..63d14486444660b 100644
--- a/llvm/lib/Support/ELFAttributes.cpp
+++ b/llvm/lib/Support/ELFAttributes.cpp
@@ -23,7 +23,7 @@ StringRef ELFAttrs::attrTypeAsString(unsigned attr, TagNameMap tagNameMap,
 
 std::optional<unsigned> ELFAttrs::attrTypeFromString(StringRef tag,
                                                      TagNameMap tagNameMap) {
-  bool hasTagPrefix = tag.startswith("Tag_");
+  bool hasTagPrefix = tag.starts_with("Tag_");
   auto tagNameIt =
       find_if(tagNameMap, [tag, hasTagPrefix](const TagNameItem item) {
         return item.tagName.drop_front(hasTagPrefix ? 0 : 4) == tag;
diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index 7a57c104ef10e7b..c2456dcac0974a6 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -263,7 +263,7 @@ const_iterator &const_iterator::operator++() {
     // Root dir.
     if (was_net ||
         // c:/
-        (is_style_windows(S) && Component.endswith(":"))) {
+        (is_style_windows(S) && Component.ends_with(":"))) {
       Component = Path.substr(Position, 1);
       return *this;
     }
@@ -352,7 +352,7 @@ StringRef root_path(StringRef path, Style style) {
   if (b != e) {
     bool has_net =
         b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
-    bool has_drive = is_style_windows(style) && b->endswith(":");
+    bool has_drive = is_style_windows(style) && b->ends_with(":");
 
     if (has_net || has_drive) {
       if ((++pos != e) && is_separator((*pos)[0], style)) {
@@ -377,7 +377,7 @@ StringRef root_name(StringRef path, Style style) {
   if (b != e) {
     bool has_net =
         b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
-    bool has_drive = is_style_windows(style) && b->endswith(":");
+    bool has_drive = is_style_windows(style) && b->ends_with(":");
 
     if (has_net || has_drive) {
       // just {C:,//net}, return the first component.
@@ -394,7 +394,7 @@ StringRef root_directory(StringRef path, Style style) {
   if (b != e) {
     bool has_net =
         b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
-    bool has_drive = is_style_windows(style) && b->endswith(":");
+    bool has_drive = is_style_windows(style) && b->ends_with(":");
 
     if ((has_net || has_drive) &&
         // {C:,//net}, skip to the next component.
@@ -514,7 +514,7 @@ static bool starts_with(StringRef Path, StringRef Prefix,
     }
     return true;
   }
-  return Path.startswith(Prefix);
+  return Path.starts_with(Prefix);
 }
 
 bool replace_path_prefix(SmallVectorImpl<char> &Path, StringRef OldPrefix,
diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp
index 158ad6fe1d9ca5d..2e9ce9231e83631 100644
--- a/llvm/lib/Support/RISCVISAInfo.cpp
+++ b/llvm/lib/Support/RISCVISAInfo.cpp
@@ -311,21 +311,21 @@ void RISCVISAInfo::addExtension(StringRef ExtName, unsigned MajorVersion,
 }
 
 static StringRef getExtensionTypeDesc(StringRef Ext) {
-  if (Ext.startswith("s"))
+  if (Ext.starts_with("s"))
     return "standard supervisor-level extension";
-  if (Ext.startswith("x"))
+  if (Ext.starts_with("x"))
     return "non-standard user-level extension";
-  if (Ext.startswith("z"))
+  if (Ext.starts_with("z"))
     return "standard user-level extension";
   return StringRef();
 }
 
 static StringRef getExtensionType(StringRef Ext) {
-  if (Ext.startswith("s"))
+  if (Ext.starts_with("s"))
     return "s";
-  if (Ext.startswith("x"))
+  if (Ext.starts_with("x"))
     return "x";
-  if (Ext.startswith("z"))
+  if (Ext.starts_with("z"))
     return "z";
   return StringRef();
 }
@@ -641,9 +641,9 @@ RISCVISAInfo::parseNormalizedArchString(StringRef Arch) {
   }
   // Must start with a valid base ISA name.
   unsigned XLen;
-  if (Arch.startswith("rv32i") || Arch.startswith("rv32e"))
+  if (Arch.starts_with("rv32i") || Arch.starts_with("rv32e"))
     XLen = 32;
-  else if (Arch.startswith("rv64i") || Arch.startswith("rv64e"))
+  else if (Arch.starts_with("rv64i") || Arch.starts_with("rv64e"))
     XLen = 64;
   else
     return createStringError(errc::invalid_argument,
@@ -704,9 +704,9 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
                              "string must be lowercase");
   }
 
-  bool HasRV64 = Arch.startswith("rv64");
+  bool HasRV64 = Arch.starts_with("rv64");
   // ISA string must begin with rv32 or rv64.
-  if (!(Arch.startswith("rv32") || HasRV64) || (Arch.size() < 5)) {
+  if (!(Arch.starts_with("rv32") || HasRV64) || (Arch.size() < 5)) {
     return createStringError(
         errc::invalid_argument,
         "string must begin with rv32{i,e,g} or rv64{i,e,g}");
diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp
index e6f74b801c52b48..669a9e2a8396521 100644
--- a/llvm/lib/Support/Signals.cpp
+++ b/llvm/lib/Support/Signals.cpp
@@ -238,12 +238,12 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
       if (FunctionName.empty())
         break;
       PrintLineHeader();
-      if (!FunctionName.startswith("??"))
+      if (!FunctionName.starts_with("??"))
         OS << FunctionName << ' ';
       if (CurLine == Lines.end())
         return false;
       StringRef FileLineInfo = *CurLine++;
-      if (!FileLineInfo.startswith("??"))
+      if (!FileLineInfo.starts_with("??"))
         OS << FileLineInfo;
       else
         OS << "(" << Modules[i] << '+' << format_hex(Offsets[i], 0) << ")";
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index ac693eca44be8b4..ac8877cca8bc66c 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -156,7 +156,7 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
   // regexes. If "#!special-case-list-v2" is the first line of the file, then
   // we will use the new behavior using globs. For more details, see
   // https://discourse.llvm.org/t/use-glob-instead-of-regex-for-specialcaselists/71666
-  bool UseGlobs = MB->getBuffer().startswith("#!special-case-list-v2\n");
+  bool UseGlobs = MB->getBuffer().starts_with("#!special-case-list-v2\n");
 
   for (line_iterator LineIt(*MB, /*SkipBlanks=*/true, /*CommentMarker=*/'#');
        !LineIt.is_at_eof(); LineIt++) {
@@ -166,8 +166,8 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
       continue;
 
     // Save section names
-    if (Line.startswith("[")) {
-      if (!Line.endswith("]")) {
+    if (Line.starts_with("[")) {
+      if (!Line.ends_with("]")) {
         Error =
             ("malformed section header on line " + Twine(LineNo) + ": " + Line)
                 .str();
diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp
index 3cce83a982c4ca3..feee47ca693b251 100644
--- a/llvm/lib/Support/StringRef.cpp
+++ b/llvm/lib/Support/StringRef.cpp
@@ -388,17 +388,17 @@ static unsigned GetAutoSenseRadix(StringRef &Str) {
   if (Str.empty())
     return 10;
 
-  if (Str.startswith("0x") || Str.startswith("0X")) {
+  if (Str.starts_with("0x") || Str.starts_with("0X")) {
     Str = Str.substr(2);
     return 16;
   }
 
-  if (Str.startswith("0b") || Str.startswith("0B")) {
+  if (Str.starts_with("0b") || Str.starts_with("0B")) {
     Str = Str.substr(2);
     return 2;
   }
 
-  if (Str.startswith("0o")) {
+  if (Str.starts_with("0o")) {
     Str = Str.substr(2);
     return 8;
   }
diff --git a/llvm/lib/Support/UnicodeNameToCodepoint.cpp b/llvm/lib/Support/UnicodeNameToCodepoint.cpp
index a09b3ffc4cdcfb8..40592660acaaed2 100644
--- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp
+++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp
@@ -123,7 +123,7 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
 
   Consummed = 0;
   if (Strict) {
-    if (!Name.startswith(Needle))
+    if (!Name.starts_with(Needle))
       return false;
     Consummed = Needle.size();
     return true;
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 1f8563ab82f4d02..367e794d38f63ac 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -1385,7 +1385,7 @@ RedirectingFileSystem::makeAbsolute(StringRef WorkingDir,
 
   std::string Result = std::string(WorkingDir);
   StringRef Dir(Result);
-  if (!Dir.endswith(sys::path::get_separator(style))) {
+  if (!Dir.ends_with(sys::path::get_separator(style))) {
     Result += sys::path::get_separator(style);
   }
   // backslashes '\' are legit path charactors under POSIX. Windows APIs
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index 744b6ff80d00384..e4563fd6ed9ef78 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -666,7 +666,7 @@ static bool isReservedName(StringRef path) {
 
   // First, check to see if this is a device namespace, which always
   // starts with \\.\, since device namespaces are not legal file paths.
-  if (path.startswith("\\\\.\\"))
+  if (path.starts_with("\\\\.\\"))
     return true;
 
   // Then compare against the list of ancient reserved names.
@@ -940,10 +940,10 @@ static bool hasFlushBufferKernelBug() {
 
 static bool isEXE(StringRef Magic) {
   static const char PEMagic[] = {'P', 'E', '\0', '\0'};
-  if (Magic.startswith(StringRef("MZ")) && Magic.size() >= 0x3c + 4) {
+  if (Magic.starts_with(StringRef("MZ")) && Magic.size() >= 0x3c + 4) {
     uint32_t off = read32le(Magic.data() + 0x3c);
     // PE/COFF file, either EXE or DLL.
-    if (Magic.substr(off).startswith(StringRef(PEMagic, sizeof(PEMagic))))
+    if (Magic.substr(off).starts_with(StringRef(PEMagic, sizeof(PEMagic))))
       return true;
   }
   return false;
diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index 493209052a1c547..a54c06d46870b58 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -158,7 +158,7 @@ static std::error_code WildcardExpand(StringRef Arg,
   // option. Paths that start with \\?\ are absolute paths, and aren't
   // expected to be used with wildcard expressions.
   if (Arg.find_first_of("*?") == StringRef::npos || Arg == "/?" ||
-      Arg == "-?" || Arg.startswith("\\\\?\\")) {
+      Arg == "-?" || Arg.starts_with("\\\\?\\")) {
     Args.push_back(Arg.data());
     return EC;
   }
diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp
index 1422e40f91944ae..17d727b6cc07da8 100644
--- a/llvm/lib/Support/YAMLParser.cpp
+++ b/llvm/lib/Support/YAMLParser.cpp
@@ -1968,7 +1968,7 @@ std::string Node::getVerbatimTag() const {
       Ret = std::string(Doc->getTagMap().find("!")->second);
       Ret += Raw.substr(1);
       return Ret;
-    } else if (Raw.startswith("!!")) {
+    } else if (Raw.starts_with("!!")) {
       Ret = std::string(Doc->getTagMap().find("!!")->second);
       Ret += Raw.substr(2);
       return Ret;

>From 5192e299cf444040025ccf3e75bfad36b4624050 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental at gmail.com>
Date: Fri, 3 Nov 2023 13:28:20 -0500
Subject: [PATCH 66/76] [mlir][python] remove various caching mechanisms
 (#70831)

This PR removes the various caching mechanisms currently in the python
bindings - both positive caching and negative caching.
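
Here "negative caching" means recording a failed lookup so it is never
retried: the maps stored py::none() on a miss and short-circuited every
future lookup. A schematic of the removed pattern, condensed from the
deleted code below:

  // Before this PR (schematic): a miss was cached as py::none().
  auto FoundIt = operationClassMap.find(operationName);
  if (FoundIt != operationClassMap.end()) {
    if (FoundIt->second.is_none())
      return std::nullopt;                 // negative cache hit
    return FoundIt->second;                // positive hit
  }
  operationClassMap[operationName] = py::none(); // poison future lookups
  return std::nullopt;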
---
 mlir/docs/Bindings/Python.md                  |   5 +-
 mlir/lib/Bindings/Python/Globals.h            |  24 +---
 mlir/lib/Bindings/Python/IRModule.cpp         | 131 +++++-------------
 mlir/lib/Bindings/Python/MainModule.cpp       |  11 +-
 .../python/mlir/_mlir_libs/_mlir/__init__.pyi |   1 +
 .../test/python/ir/custom_dialect/__init__.py |   0
 mlir/test/python/ir/custom_dialect/custom.py  |   4 +
 .../python/ir/custom_dialect/lit.local.cfg    |   2 +
 mlir/test/python/ir/dialects.py               |  17 +++
 mlir/test/python/ir/insertion_point.py        |   2 -
 10 files changed, 80 insertions(+), 117 deletions(-)
 create mode 100644 mlir/test/python/ir/custom_dialect/__init__.py
 create mode 100644 mlir/test/python/ir/custom_dialect/custom.py
 create mode 100644 mlir/test/python/ir/custom_dialect/lit.local.cfg

diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md
index bc2e676a878c0f4..6e52c4deaad9aa9 100644
--- a/mlir/docs/Bindings/Python.md
+++ b/mlir/docs/Bindings/Python.md
@@ -945,10 +945,11 @@ When the python bindings need to locate a wrapper module, they consult the
 `dialect_search_path` and use it to find an appropriately named module. For the
 main repository, this search path is hard-coded to include the `mlir.dialects`
 module, which is where wrappers are emitted by the above build rule. Out of tree
-dialects and add their modules to the search path by calling:
+dialects can add their modules to the search path by calling:
 
 ```python
-mlir._cext.append_dialect_search_prefix("myproject.mlir.dialects")
+from mlir.dialects._ods_common import _cext
+_cext.globals.append_dialect_search_prefix("myproject.mlir.dialects")
 ```
 
 ### Wrapper module code organization
diff --git a/mlir/lib/Bindings/Python/Globals.h b/mlir/lib/Bindings/Python/Globals.h
index 21899bdce22e810..976297257ced06e 100644
--- a/mlir/lib/Bindings/Python/Globals.h
+++ b/mlir/lib/Bindings/Python/Globals.h
@@ -9,10 +9,6 @@
 #ifndef MLIR_BINDINGS_PYTHON_GLOBALS_H
 #define MLIR_BINDINGS_PYTHON_GLOBALS_H
 
-#include <optional>
-#include <string>
-#include <vector>
-
 #include "PybindUtils.h"
 
 #include "mlir-c/IR.h"
@@ -21,6 +17,10 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 
+#include <optional>
+#include <string>
+#include <vector>
+
 namespace mlir {
 namespace python {
 
@@ -45,17 +45,13 @@ class PyGlobals {
     dialectSearchPrefixes.swap(newValues);
   }
 
-  /// Clears positive and negative caches regarding what implementations are
-  /// available. Future lookups will do more expensive existence checks.
-  void clearImportCache();
-
   /// Loads a python module corresponding to the given dialect namespace.
   /// No-ops if the module has already been loaded or is not found. Raises
   /// an error on any evaluation issues.
   /// Note that this returns void because it is expected that the module
   /// contains calls to decorators and helpers that register the salient
-  /// entities.
-  void loadDialectModule(llvm::StringRef dialectNamespace);
+  /// entities. Returns true if dialect is successfully loaded.
+  bool loadDialectModule(llvm::StringRef dialectNamespace);
 
   /// Adds a user-friendly Attribute builder.
   /// Raises an exception if the mapping already exists and replace == false.
@@ -113,16 +109,10 @@ class PyGlobals {
   llvm::StringMap<pybind11::object> attributeBuilderMap;
   /// Map of MlirTypeID to custom type caster.
   llvm::DenseMap<MlirTypeID, pybind11::object> typeCasterMap;
-  /// Cache for map of MlirTypeID to custom type caster.
-  llvm::DenseMap<MlirTypeID, pybind11::object> typeCasterMapCache;
 
   /// Set of dialect namespaces that we have attempted to import implementation
   /// modules for.
-  llvm::StringSet<> loadedDialectModulesCache;
-  /// Cache of operation name to external operation class object. This is
-  /// maintained on lookup as a shadow of operationClassMap in order for repeat
-  /// lookups of the classes to only incur the cost of one hashtable lookup.
-  llvm::StringMap<pybind11::object> operationClassMapCache;
+  llvm::StringSet<> loadedDialectModules;
 };
 
 } // namespace python
diff --git a/mlir/lib/Bindings/Python/IRModule.cpp b/mlir/lib/Bindings/Python/IRModule.cpp
index f8e22f7bb0c1ba7..6c5cde86236ce90 100644
--- a/mlir/lib/Bindings/Python/IRModule.cpp
+++ b/mlir/lib/Bindings/Python/IRModule.cpp
@@ -10,12 +10,12 @@
 #include "Globals.h"
 #include "PybindUtils.h"
 
-#include <optional>
-#include <vector>
-
 #include "mlir-c/Bindings/Python/Interop.h"
 #include "mlir-c/Support.h"
 
+#include <optional>
+#include <vector>
+
 namespace py = pybind11;
 using namespace mlir;
 using namespace mlir::python;
@@ -36,12 +36,12 @@ PyGlobals::PyGlobals() {
 
 PyGlobals::~PyGlobals() { instance = nullptr; }
 
-void PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) {
-  if (loadedDialectModulesCache.contains(dialectNamespace))
-    return;
+bool PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) {
+  if (loadedDialectModules.contains(dialectNamespace))
+    return true;
   // Since re-entrancy is possible, make a copy of the search prefixes.
   std::vector<std::string> localSearchPrefixes = dialectSearchPrefixes;
-  py::object loaded;
+  py::object loaded = py::none();
   for (std::string moduleName : localSearchPrefixes) {
     moduleName.push_back('.');
     moduleName.append(dialectNamespace.data(), dialectNamespace.size());
@@ -57,15 +57,18 @@ void PyGlobals::loadDialectModule(llvm::StringRef dialectNamespace) {
     break;
   }
 
+  if (loaded.is_none())
+    return false;
   // Note: Iterator cannot be shared from prior to loading, since re-entrancy
   // may have occurred, which may do anything.
-  loadedDialectModulesCache.insert(dialectNamespace);
+  loadedDialectModules.insert(dialectNamespace);
+  return true;
 }
 
 void PyGlobals::registerAttributeBuilder(const std::string &attributeKind,
                                          py::function pyFunc, bool replace) {
   py::object &found = attributeBuilderMap[attributeKind];
-  if (found && !found.is_none() && !replace) {
+  if (found && !replace) {
     throw std::runtime_error((llvm::Twine("Attribute builder for '") +
                               attributeKind +
                               "' is already registered with func: " +
@@ -79,13 +82,10 @@ void PyGlobals::registerTypeCaster(MlirTypeID mlirTypeID,
                                    pybind11::function typeCaster,
                                    bool replace) {
   pybind11::object &found = typeCasterMap[mlirTypeID];
-  if (found && !found.is_none() && !replace)
-    throw std::runtime_error("Type caster is already registered");
+  if (found && !replace)
+    throw std::runtime_error("Type caster is already registered with caster: " +
+                             py::str(found).operator std::string());
   found = std::move(typeCaster);
-  const auto foundIt = typeCasterMapCache.find(mlirTypeID);
-  if (foundIt != typeCasterMapCache.end() && !foundIt->second.is_none()) {
-    typeCasterMapCache[mlirTypeID] = found;
-  }
 }
 
 void PyGlobals::registerDialectImpl(const std::string &dialectNamespace,
@@ -108,114 +108,59 @@ void PyGlobals::registerOperationImpl(const std::string &operationName,
                                  .str());
   }
   found = std::move(pyClass);
-  auto foundIt = operationClassMapCache.find(operationName);
-  if (foundIt != operationClassMapCache.end() && !foundIt->second.is_none()) {
-    operationClassMapCache[operationName] = found;
-  }
 }
 
 std::optional<py::function>
 PyGlobals::lookupAttributeBuilder(const std::string &attributeKind) {
-  // Fast match against the class map first (common case).
   const auto foundIt = attributeBuilderMap.find(attributeKind);
   if (foundIt != attributeBuilderMap.end()) {
-    if (foundIt->second.is_none())
-      return std::nullopt;
-    assert(foundIt->second && "py::function is defined");
+    assert(foundIt->second && "attribute builder is defined");
     return foundIt->second;
   }
-
-  // Not found and loading did not yield a registration. Negative cache.
-  attributeBuilderMap[attributeKind] = py::none();
   return std::nullopt;
 }
 
 std::optional<py::function> PyGlobals::lookupTypeCaster(MlirTypeID mlirTypeID,
                                                         MlirDialect dialect) {
-  {
-    // Fast match against the class map first (common case).
-    const auto foundIt = typeCasterMapCache.find(mlirTypeID);
-    if (foundIt != typeCasterMapCache.end()) {
-      if (foundIt->second.is_none())
-        return std::nullopt;
-      assert(foundIt->second && "py::function is defined");
-      return foundIt->second;
-    }
-  }
-
-  // Not found. Load the dialect namespace.
-  loadDialectModule(unwrap(mlirDialectGetNamespace(dialect)));
-
-  // Attempt to find from the canonical map and cache.
-  {
-    const auto foundIt = typeCasterMap.find(mlirTypeID);
-    if (foundIt != typeCasterMap.end()) {
-      if (foundIt->second.is_none())
-        return std::nullopt;
-      assert(foundIt->second && "py::object is defined");
-      // Positive cache.
-      typeCasterMapCache[mlirTypeID] = foundIt->second;
-      return foundIt->second;
-    }
-    // Negative cache.
-    typeCasterMap[mlirTypeID] = py::none();
+  // Make sure dialect module is loaded.
+  if (!loadDialectModule(unwrap(mlirDialectGetNamespace(dialect))))
     return std::nullopt;
+
+  const auto foundIt = typeCasterMap.find(mlirTypeID);
+  if (foundIt != typeCasterMap.end()) {
+    assert(foundIt->second && "type caster is defined");
+    return foundIt->second;
   }
+  return std::nullopt;
 }
 
 std::optional<py::object>
 PyGlobals::lookupDialectClass(const std::string &dialectNamespace) {
-  loadDialectModule(dialectNamespace);
-  // Fast match against the class map first (common case).
+  // Make sure dialect module is loaded.
+  if (!loadDialectModule(dialectNamespace))
+    return std::nullopt;
   const auto foundIt = dialectClassMap.find(dialectNamespace);
   if (foundIt != dialectClassMap.end()) {
-    if (foundIt->second.is_none())
-      return std::nullopt;
-    assert(foundIt->second && "py::object is defined");
+    assert(foundIt->second && "dialect class is defined");
     return foundIt->second;
   }
-
-  // Not found and loading did not yield a registration. Negative cache.
-  dialectClassMap[dialectNamespace] = py::none();
+  // Not found and loading did not yield a registration.
   return std::nullopt;
 }
 
 std::optional<pybind11::object>
 PyGlobals::lookupOperationClass(llvm::StringRef operationName) {
-  {
-    auto foundIt = operationClassMapCache.find(operationName);
-    if (foundIt != operationClassMapCache.end()) {
-      if (foundIt->second.is_none())
-        return std::nullopt;
-      assert(foundIt->second && "py::object is defined");
-      return foundIt->second;
-    }
-  }
-
-  // Not found. Load the dialect namespace.
+  // Make sure dialect module is loaded.
   auto split = operationName.split('.');
   llvm::StringRef dialectNamespace = split.first;
-  loadDialectModule(dialectNamespace);
-
-  // Attempt to find from the canonical map and cache.
-  {
-    auto foundIt = operationClassMap.find(operationName);
-    if (foundIt != operationClassMap.end()) {
-      if (foundIt->second.is_none())
-        return std::nullopt;
-      assert(foundIt->second && "py::object is defined");
-      // Positive cache.
-      operationClassMapCache[operationName] = foundIt->second;
-      return foundIt->second;
-    }
-    // Negative cache.
-    operationClassMap[operationName] = py::none();
+  if (!loadDialectModule(dialectNamespace))
     return std::nullopt;
-  }
-}
 
-void PyGlobals::clearImportCache() {
-  loadedDialectModulesCache.clear();
-  operationClassMapCache.clear();
-  typeCasterMapCache.clear();
+  auto foundIt = operationClassMap.find(operationName);
+  if (foundIt != operationClassMap.end()) {
+    assert(foundIt->second && "OpView is defined");
+    return foundIt->second;
+  }
+  // Not found and loading did not yield a registration.
+  return std::nullopt;
 }
diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp
index a936becf67bea75..2ba3a3677198cbc 100644
--- a/mlir/lib/Bindings/Python/MainModule.cpp
+++ b/mlir/lib/Bindings/Python/MainModule.cpp
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <tuple>
-
 #include "PybindUtils.h"
 
 #include "Globals.h"
 #include "IRModule.h"
 #include "Pass.h"
 
+#include <tuple>
+
 namespace py = pybind11;
 using namespace mlir;
 using namespace py::literals;
@@ -34,9 +34,14 @@ PYBIND11_MODULE(_mlir, m) {
           "append_dialect_search_prefix",
           [](PyGlobals &self, std::string moduleName) {
             self.getDialectSearchPrefixes().push_back(std::move(moduleName));
-            self.clearImportCache();
           },
           "module_name"_a)
+      .def(
+          "_check_dialect_module_loaded",
+          [](PyGlobals &self, const std::string &dialectNamespace) {
+            return self.loadDialectModule(dialectNamespace);
+          },
+          "dialect_namespace"_a)
       .def("_register_dialect_impl", &PyGlobals::registerDialectImpl,
            "dialect_namespace"_a, "dialect_class"_a,
            "Testing hook for directly registering a dialect")
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi b/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi
index 93b98c4aa53fbd8..3ed1872f1cd5a21 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi
@@ -7,6 +7,7 @@ class _Globals:
     def _register_dialect_impl(self, dialect_namespace: str, dialect_class: type) -> None: ...
     def _register_operation_impl(self, operation_name: str, operation_class: type) -> None: ...
     def append_dialect_search_prefix(self, module_name: str) -> None: ...
+    def _check_dialect_module_loaded(self, dialect_namespace: str) -> bool: ...
 
 def register_dialect(dialect_class: type) -> object: ...
 def register_operation(dialect_class: type) -> object: ...
diff --git a/mlir/test/python/ir/custom_dialect/__init__.py b/mlir/test/python/ir/custom_dialect/__init__.py
new file mode 100644
index 000000000000000..e69de29bb2d1d64
diff --git a/mlir/test/python/ir/custom_dialect/custom.py b/mlir/test/python/ir/custom_dialect/custom.py
new file mode 100644
index 000000000000000..388368ca6fe6bcd
--- /dev/null
+++ b/mlir/test/python/ir/custom_dialect/custom.py
@@ -0,0 +1,4 @@
+# The purpose of this empty dialect module is to enable successfully loading the "custom" dialect.
+# Without this file here (and a corresponding _cext.globals.append_dialect_search_prefix("custom_dialect")),
+# PyGlobals::loadDialectModule would search and fail to find the "custom" dialect for each Operation.create("custom.op")
+# (amongst other things).
diff --git a/mlir/test/python/ir/custom_dialect/lit.local.cfg b/mlir/test/python/ir/custom_dialect/lit.local.cfg
new file mode 100644
index 000000000000000..26ea63660d6a3bf
--- /dev/null
+++ b/mlir/test/python/ir/custom_dialect/lit.local.cfg
@@ -0,0 +1,2 @@
+config.excludes.add("__init__.py")
+config.excludes.add("custom.py")
diff --git a/mlir/test/python/ir/dialects.py b/mlir/test/python/ir/dialects.py
index eebf7c3e48989ff..d59c6a6bc424e68 100644
--- a/mlir/test/python/ir/dialects.py
+++ b/mlir/test/python/ir/dialects.py
@@ -1,7 +1,9 @@
 # RUN: %PYTHON %s | FileCheck %s
 
 import gc
+import sys
 from mlir.ir import *
+from mlir.dialects._ods_common import _cext
 
 
 def run(f):
@@ -104,3 +106,18 @@ def testIsRegisteredOperation():
     print(f"cf.cond_br: {ctx.is_registered_operation('cf.cond_br')}")
     # CHECK: func.not_existing: False
     print(f"func.not_existing: {ctx.is_registered_operation('func.not_existing')}")
+
+
+# CHECK-LABEL: TEST: testAppendPrefixSearchPath
+ at run
+def testAppendPrefixSearchPath():
+    ctx = Context()
+    ctx.allow_unregistered_dialects = True
+    with Location.unknown(ctx):
+        assert not _cext.globals._check_dialect_module_loaded("custom")
+        Operation.create("custom.op")
+        assert not _cext.globals._check_dialect_module_loaded("custom")
+
+        sys.path.append(".")
+        _cext.globals.append_dialect_search_prefix("custom_dialect")
+        assert _cext.globals._check_dialect_module_loaded("custom")
diff --git a/mlir/test/python/ir/insertion_point.py b/mlir/test/python/ir/insertion_point.py
index 268d2e77d036f5e..5eb861a2c089191 100644
--- a/mlir/test/python/ir/insertion_point.py
+++ b/mlir/test/python/ir/insertion_point.py
@@ -1,8 +1,6 @@
 # RUN: %PYTHON %s | FileCheck %s
 
 import gc
-import io
-import itertools
 from mlir.ir import *
 
 

>From 621d95a8765ad044a27750477636248397963518 Mon Sep 17 00:00:00 2001
From: Brad Smith <brad at comstyle.com>
Date: Fri, 3 Nov 2023 15:23:11 -0400
Subject: [PATCH 67/76] [ADT] Make use of the endian.h header on OpenBSD
 (#71144)

OpenBSD prefers the use of the endian.h header.
---
 llvm/include/llvm/ADT/bit.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/bit.h b/llvm/include/llvm/ADT/bit.h
index 0a228b2204d6fc2..12223facbb30500 100644
--- a/llvm/include/llvm/ADT/bit.h
+++ b/llvm/include/llvm/ADT/bit.h
@@ -28,7 +28,7 @@
 #endif
 
 #if defined(__linux__) || defined(__GNU__) || defined(__HAIKU__) ||            \
-    defined(__Fuchsia__) || defined(__EMSCRIPTEN__)
+    defined(__Fuchsia__) || defined(__EMSCRIPTEN__) || defined(__OpenBSD__)
 #include <endian.h>
 #elif defined(_AIX)
 #include <sys/machine.h>

>From c880fdc0f05f4a18400c5633cf6cd8fad85e3df0 Mon Sep 17 00:00:00 2001
From: XChy <xxs_chy at outlook.com>
Date: Sat, 4 Nov 2023 03:32:20 +0800
Subject: [PATCH 68/76] [DFAJumpThreading] Remove incoming StartBlock from all
 phis when unfolding select (#71082)

Fixes #65222.
When unfolding a select into diamond-like control flow, we need to remove
the StartBlock from all phis in EndBlock, not just from the phi that used
the select: after unfolding, EndBlock is reached only through TrueBlock and
FalseBlock, so a leftover StartBlock entry would refer to a dead edge.
---
 .../Transforms/Scalar/DFAJumpThreading.cpp    | 10 ++-
 .../DFAJumpThreading/dfa-unfold-select.ll     | 65 +++++++++++++++++--
 2 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index c06564d45b06430..edfeb36f3422e2e 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -249,16 +249,20 @@ void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
     FT = FalseBlock;
 
     // Update the phi node of SI.
-    SIUse->removeIncomingValue(StartBlock, /* DeletePHIIfEmpty = */ false);
     SIUse->addIncoming(SI->getTrueValue(), TrueBlock);
     SIUse->addIncoming(SI->getFalseValue(), FalseBlock);
 
     // Update any other PHI nodes in EndBlock.
     for (PHINode &Phi : EndBlock->phis()) {
       if (&Phi != SIUse) {
-        Phi.addIncoming(Phi.getIncomingValueForBlock(StartBlock), TrueBlock);
-        Phi.addIncoming(Phi.getIncomingValueForBlock(StartBlock), FalseBlock);
+        Value *OrigValue = Phi.getIncomingValueForBlock(StartBlock);
+        Phi.addIncoming(OrigValue, TrueBlock);
+        Phi.addIncoming(OrigValue, FalseBlock);
       }
+
+      // Remove the incoming entry for the original StartBlock; its value now
+      // arrives indirectly (through TrueBlock and FalseBlock).
+      Phi.removeIncomingValue(StartBlock, /* DeletePHIIfEmpty = */ false);
     }
   } else {
     BasicBlock *NewBlock = nullptr;
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
index c7bce505b717892..df725b9a7fa47d1 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
@@ -15,8 +15,8 @@ define i32 @test1(i32 %num) {
 ; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ poison, [[FOR_INC]] ]
 ; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [
-; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
-; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       for.body.jt2:
 ; CHECK-NEXT:    [[COUNT_JT2:%.*]] = phi i32 [ [[INC_JT2:%.*]], [[FOR_INC_JT2:%.*]] ]
@@ -91,8 +91,8 @@ define i32 @test2(i32 %num) {
 ; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ poison, [[FOR_INC]] ]
 ; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [
-; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
-; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       for.body.jt3:
 ; CHECK-NEXT:    [[COUNT_JT3:%.*]] = phi i32 [ [[INC_JT3:%.*]], [[FOR_INC_JT3:%.*]] ]
@@ -192,8 +192,8 @@ define i32 @test3(i32 %num) {
 ; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
 ; CHECK-NEXT:    [[STATE:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ poison, [[FOR_INC]] ]
 ; CHECK-NEXT:    switch i32 [[STATE]], label [[FOR_INC_JT1:%.*]] [
-; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
-; CHECK-NEXT:    i32 2, label [[CASE2:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       for.body.jt4:
 ; CHECK-NEXT:    [[COUNT_JT4:%.*]] = phi i32 [ [[INC_JT4:%.*]], [[FOR_INC_JT4:%.*]] ]
@@ -316,3 +316,56 @@ for.cond:                                         ; preds = %lor.end, %entry
 lor.end:                                          ; preds = %for.cond
   br label %for.cond
 }
+
+define void @pr65222(i32 %flags, i1 %cmp, i1 %tobool.not) {
+; CHECK-LABEL: @pr65222(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    br i1 [[CMP:%.*]], label [[THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT:%.*]], label [[SI_UNFOLD_TRUE:%.*]], label [[SI_UNFOLD_FALSE:%.*]]
+; CHECK:       si.unfold.true:
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_END]], label [[SI_UNFOLD_FALSE2:%.*]]
+; CHECK:       si.unfold.false:
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_END]], label [[SI_UNFOLD_FALSE1:%.*]]
+; CHECK:       si.unfold.false1:
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       si.unfold.false2:
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[UNFOLDED:%.*]] = phi i32 [ [[FLAGS:%.*]], [[WHILE_COND]] ], [ 3, [[SI_UNFOLD_TRUE]] ], [ 2, [[SI_UNFOLD_FALSE]] ], [ 0, [[SI_UNFOLD_FALSE1]] ], [ 1, [[SI_UNFOLD_FALSE2]] ]
+; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ [[FLAGS]], [[WHILE_COND]] ], [ 0, [[SI_UNFOLD_TRUE]] ], [ 0, [[SI_UNFOLD_FALSE]] ], [ 0, [[SI_UNFOLD_FALSE1]] ], [ 0, [[SI_UNFOLD_FALSE2]] ]
+; CHECK-NEXT:    switch i32 [[UNFOLDED]], label [[UNREACHABLE:%.*]] [
+; CHECK-NEXT:      i32 0, label [[SW_BB:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       unreachable:
+; CHECK-NEXT:    unreachable
+; CHECK:       sw.bb:
+; CHECK-NEXT:    br label [[WHILE_COND]]
+;
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %sw.bb147, %if.end, %entry
+  br i1 %cmp, label %then, label %if.end
+
+then:                                        ; preds = %while.cond
+  %cond = select i1 %cmp, i32 2, i32 0
+  %cond1 = select i1 %cmp, i32 3, i32 1
+  %tounfold = select i1 %tobool.not, i32 %cond1, i32 %cond
+  br label %if.end
+
+if.end:                                        ; preds = %then, %while.cond
+  %unfolded = phi i32 [ %tounfold, %then ], [ %flags, %while.cond ]
+  %other = phi i32 [ 0, %then ], [ %flags, %while.cond ]
+  switch i32 %unfolded, label %unreachable [
+  i32 0, label %sw.bb
+  ]
+
+unreachable:
+  unreachable
+
+sw.bb:                                         ; preds = %if.end
+  br label %while.cond
+}

>From 284c6990c133ed88f32de111accacc9f55a7a51d Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <nickdesaulniers at users.noreply.github.com>
Date: Fri, 3 Nov 2023 12:49:07 -0700
Subject: [PATCH 69/76] [CalcSpillWeights] don't mark live intervals with
 spillable inlineasm ops as having infinite spill weight (#70747)

This is necessary for RegAllocGreedy to support memory folding of inline
asm that uses "rm" constraints.

Thanks to @qcolombet for the suggestion.

Link: https://github.com/llvm/llvm-project/issues/20571
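
For context, an "rm" constraint lets the compiler place the operand either
in a register or in a stack slot, so its live interval has to remain
spillable for the fold to be possible. A minimal x86 sketch (illustrative
only, not code from this patch):

  void bump() {
    int V = 42;
    // "+rm": the operand may live in a register or be folded to memory;
    // marking its interval not-spillable would forbid the memory form.
    asm volatile("incl %0" : "+rm"(V));
  }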
---
 llvm/lib/CodeGen/CalcSpillWeights.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 6e98e2384ef975f..bf114921a7d220a 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -146,6 +146,17 @@ void VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &LI) {
   LI.setWeight(Weight);
 }
 
+static bool canMemFoldInlineAsm(LiveInterval &LI,
+                                const MachineRegisterInfo &MRI) {
+  for (const MachineOperand &MO : MRI.reg_operands(LI.reg())) {
+    const MachineInstr *MI = MO.getParent();
+    if (MI->isInlineAsm() && MI->mayFoldInlineAsmRegOp(MI->getOperandNo(&MO)))
+      return true;
+  }
+
+  return false;
+}
+
 float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
                                        SlotIndex *End) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -315,7 +326,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
   // into instruction itself makes perfect sense.
   if (ShouldUpdateLI && LI.isZeroLength(LIS.getSlotIndexes()) &&
       !LI.isLiveAtIndexes(LIS.getRegMaskSlots()) &&
-      !isLiveAtStatepointVarArg(LI)) {
+      !isLiveAtStatepointVarArg(LI) && !canMemFoldInlineAsm(LI, MRI)) {
     LI.markNotSpillable();
     return -1.0;
   }

>From 32e35b21b5971cc939b1de1194145d9b934fcb54 Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs at fb.com>
Date: Fri, 3 Nov 2023 12:52:16 -0700
Subject: [PATCH 70/76] [BPF] Skip modifiers for __builtin_btf_type_id() local
 type (#71094)

BPF upstream reported an inconsistent behavior w.r.t. BPF_TYPE_ID_LOCAL
vs. BPF_TYPE_ID_TARGET (or BPF_TYPE_ID_REMOTE in LLVM terminology).

For BPF_TYPE_ID_TARGET, all modifiers (like 'const' and 'volatile') are
ignored in the final type encoding. For example, for type
 'const struct foo', the eventual encoding in the BTF relocation
is 'struct foo'. This facilitates libbpf matching the corresponding kernel
types without considering any modifiers.

Currently the behavior for BPF_TYPE_ID_LOCAL is different: it encodes
'const struct foo' in the BTF relocation, and this discrepancy has
confused users ([1]).

This patch fixes the discrepancy by making the BPF_TYPE_ID_LOCAL BTF type
representation the same as BPF_TYPE_ID_TARGET's. This should have minimal
user impact since ultimately the user wants the real type, not a 'const'
type modifier.

The selftest builtin-btf-type-id-2.ll is used to test BPF_TYPE_ID_TARGET
with 'const' modifier. Adapt the same test for BPF_TYPE_ID_LOCAL. And
the below diff shows now both BPF_TYPE_ID_LOCAL and BPF_TYPE_ID_TARGET
produces the same type:

$ diff test/CodeGen/BPF/BTF/builtin-btf-type-id-2.ll test/CodeGen/BPF/BTF/builtin-btf-type-id-local.ll
--- test/CodeGen/BPF/BTF/builtin-btf-type-id-2.ll       2023-07-30 16:58:20.657528310 -0700
+++ test/CodeGen/BPF/BTF/builtin-btf-type-id-local.ll   2023-11-02 10:23:25.356959008 -0700
@@ -6,7 +6,7 @@
 ;     int a;
 ;   };
 ;   int test(void) {
-;     return __builtin_btf_type_id(*(const struct s *)0, 1);
+;     return __builtin_btf_type_id(*(const struct s *)0, 0);
 ;   }
 ; Compilation flag:
 ;   clang -target bpf -O2 -g -S -emit-llvm -Xclang -disable-llvm-passes test.c
$

[1]
https://lore.kernel.org/bpf/CAN+4W8h3yDjkOLJPiuKVKTpj_08pBz8ke6vN=Lf8gcA=iYBM-g@mail.gmail.com/

Co-authored-by: Yonghong Song <yonghong.song at linux.dev>
---
 llvm/lib/Target/BPF/BPFPreserveDIType.cpp     | 19 ++---
 .../BPF/BTF/builtin-btf-type-id-local.ll      | 73 +++++++++++++++++++
 2 files changed, 83 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id-local.ll

diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
index 78e1bf90f1bd524..fc4fb4d8f8001b3 100644
--- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
@@ -86,15 +86,16 @@ static bool BPFPreserveDITypeImpl(Function &F) {
       Reloc = BTF::BTF_TYPE_ID_LOCAL;
     } else {
       Reloc = BTF::BTF_TYPE_ID_REMOTE;
-      DIType *Ty = cast<DIType>(MD);
-      while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
-        unsigned Tag = DTy->getTag();
-        if (Tag != dwarf::DW_TAG_const_type &&
-            Tag != dwarf::DW_TAG_volatile_type)
-          break;
-        Ty = DTy->getBaseType();
-      }
+    }
+    DIType *Ty = cast<DIType>(MD);
+    while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+      unsigned Tag = DTy->getTag();
+      if (Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type)
+        break;
+      Ty = DTy->getBaseType();
+    }
 
+    if (Reloc == BTF::BTF_TYPE_ID_REMOTE) {
       if (Ty->getName().empty()) {
         if (isa<DISubroutineType>(Ty))
           report_fatal_error(
@@ -102,8 +103,8 @@ static bool BPFPreserveDITypeImpl(Function &F) {
         else
           report_fatal_error("Empty type name for BTF_TYPE_ID_REMOTE reloc");
       }
-      MD = Ty;
     }
+    MD = Ty;
 
     BasicBlock *BB = Call->getParent();
     IntegerType *VarType = Type::getInt64Ty(BB->getContext());
diff --git a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id-local.ll b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id-local.ll
new file mode 100644
index 000000000000000..763b7345dcb8330
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id-local.ll
@@ -0,0 +1,73 @@
+; RUN: opt -O2 -mtriple=bpf-pc-linux -S -o %t1 %s
+; RUN: llc -filetype=asm -o - %t1 | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mattr=+alu32 -filetype=asm -o - %t1 | FileCheck -check-prefixes=CHECK %s
+; Source code:
+;   struct s {
+;     int a;
+;   };
+;   int test(void) {
+;     return __builtin_btf_type_id(*(const struct s *)0, 0);
+;   }
+; Compilation flag:
+;   clang -target bpf -O2 -g -S -emit-llvm -Xclang -disable-llvm-passes test.c
+
+; Function Attrs: nounwind
+define dso_local i32 @test() #0 !dbg !7 {
+entry:
+  %0 = call i64 @llvm.bpf.btf.type.id(i32 0, i64 1), !dbg !11, !llvm.preserve.access.index !12
+  %conv = trunc i64 %0 to i32, !dbg !11
+  ret i32 %conv, !dbg !16
+}
+
+; CHECK:             .long   1                               # BTF_KIND_INT(id = 2)
+; CHECK-NEXT:        .long   16777216                        # 0x1000000
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   16777248                        # 0x1000020
+
+; CHECK:             .long   16                              # BTF_KIND_STRUCT(id = 4)
+; CHECK-NEXT:        .long   67108865                        # 0x4000001
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   18
+; CHECK-NEXT:        .long   2
+
+; CHECK:             .ascii  "int"                           # string offset=1
+; CHECK:             .ascii  ".text"                         # string offset=10
+; CHECK:             .byte   115                             # string offset=16
+; CHECK:             .byte   97                              # string offset=18
+; CHECK:             .byte   48                              # string offset=20
+
+; CHECK:             .long   16                              # FieldReloc
+; CHECK-NEXT:        .long   10                              # Field reloc section string offset=10
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   20
+; CHECK-NEXT:        .long   7
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.bpf.btf.type.id(i32, i64) #1
+
+attributes #0 = { nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0 (https://github.com/llvm/llvm-project.git 9783e2098800b954c55ae598a1ce5c4b93444fc0)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp/home/yhs/bpf/test")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 13.0.0 (https://github.com/llvm/llvm-project.git 9783e2098800b954c55ae598a1ce5c4b93444fc0)"}
+!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DILocation(line: 5, column: 10, scope: !7)
+!12 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !13)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", file: !1, line: 1, size: 32, elements: !14)
+!14 = !{!15}
+!15 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !13, file: !1, line: 2, baseType: !10, size: 32)
+!16 = !DILocation(line: 5, column: 3, scope: !7)

>From 76b1a7c91651b8c3516913ce9b11c05147ec46d2 Mon Sep 17 00:00:00 2001
From: Ryan Prichard <rprichard at google.com>
Date: Fri, 3 Nov 2023 12:54:55 -0700
Subject: [PATCH 71/76] [libc++][Android] Explicitly declare low-level lib
 existence (#70534)

Android's librt and libpthread functionality is part of libc.{a,so}
instead. The atomic APIs are part of the compiler-rt builtins archive.
Android does have libdl.

Android's libc.so has `__cxa_thread_atexit_impl` starting in API 23, and
the oldest supported API level is 21, so continue using feature detection
for that symbol.

These settings need to be declared explicitly for the sake of the fuzzer
library's custom libc++ build (`add_custom_libcxx`). That macro builds
libc++ with `-DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY`, which
breaks the feature detection.
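
For context, a minimal sketch of the failure mode (a stand-alone CMake
fragment, not part of this patch): under
`-DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY`, probes such as the one
below only compile and never link, so they no longer reflect whether the
library actually exists on the target.

  # Hypothetical stand-alone probe; mirrors the default branch in
  # libcxx/cmake/config-ix.cmake. With STATIC_LIBRARY try-compiles,
  # check_library_exists() builds a static archive instead of linking a
  # test executable, so a missing libpthread is not detected.
  include(CheckLibraryExists)
  check_library_exists(pthread pthread_create "" LIBCXX_HAS_PTHREAD_LIB)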
---
 libcxx/cmake/config-ix.cmake    | 4 ++++
 libcxxabi/cmake/config-ix.cmake | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/libcxx/cmake/config-ix.cmake b/libcxx/cmake/config-ix.cmake
index 9fed861a4e193c5..a365517936e7565 100644
--- a/libcxx/cmake/config-ix.cmake
+++ b/libcxx/cmake/config-ix.cmake
@@ -107,6 +107,10 @@ elseif(FUCHSIA)
   set(LIBCXX_HAS_PTHREAD_LIB NO)
   set(LIBCXX_HAS_RT_LIB NO)
   check_library_exists(atomic __atomic_fetch_add_8 "" LIBCXX_HAS_ATOMIC_LIB)
+elseif(ANDROID)
+  set(LIBCXX_HAS_PTHREAD_LIB NO)
+  set(LIBCXX_HAS_RT_LIB NO)
+  set(LIBCXX_HAS_ATOMIC_LIB NO)
 else()
   check_library_exists(pthread pthread_create "" LIBCXX_HAS_PTHREAD_LIB)
   check_library_exists(rt clock_gettime "" LIBCXX_HAS_RT_LIB)
diff --git a/libcxxabi/cmake/config-ix.cmake b/libcxxabi/cmake/config-ix.cmake
index 702fe7d1d72f777..39b9284b780ebcf 100644
--- a/libcxxabi/cmake/config-ix.cmake
+++ b/libcxxabi/cmake/config-ix.cmake
@@ -95,6 +95,11 @@ if(FUCHSIA)
   set(LIBCXXABI_HAS_PTHREAD_LIB NO)
   check_library_exists(c __cxa_thread_atexit_impl ""
     LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL)
+elseif(ANDROID)
+  set(LIBCXXABI_HAS_DL_LIB YES)
+  set(LIBCXXABI_HAS_PTHREAD_LIB NO)
+  check_library_exists(c __cxa_thread_atexit_impl ""
+    LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL)
 else()
   check_library_exists(dl dladdr "" LIBCXXABI_HAS_DL_LIB)
   check_library_exists(pthread pthread_once "" LIBCXXABI_HAS_PTHREAD_LIB)

>From 71a13675dad588712094fb09c655570974f98540 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <adrian-prantl at users.noreply.github.com>
Date: Fri, 3 Nov 2023 12:57:41 -0700
Subject: [PATCH 72/76] [dsymutil] Filter out swiftinterface files from the
 toolchain. (#71205)

Dsymutil already avoids copying textual Swift interface files from the
SDK, since any consumer would have to have a matching SDK installed
anyway. It should also do the same for interfaces found in the
toolchain itself, which ships the compiler's built-in libraries such as
Swift (i.e. the standard library) and _Concurrency.

rdar://117881604
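
For example, with the sysroot used in the updated test input below, the
toolchain base directory is guessed by walking up out of the SDKs
directory:

  sysroot:   /Xcode.app/Contents/Developer/SDKs/MacOSX.sdk
  toolchain: /Xcode.app/Contents/Developer/Toolchains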
---
 llvm/lib/DWARFLinker/DWARFLinker.cpp          | 19 ++++++
 .../DWARFLinkerCompileUnit.cpp                | 19 ++++++
 .../tools/dsymutil/Inputs/swift-interface.s   | 63 ++++++++++++-------
 .../tools/dsymutil/X86/swift-interface.test   |  4 +-
 4 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index 80a4e2adefa6cb6..d891e7205aa6cc7 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -177,6 +177,20 @@ static void resolveRelativeObjectPath(SmallVectorImpl<char> &Buf, DWARFDie CU) {
   sys::path::append(Buf, dwarf::toString(CU.find(dwarf::DW_AT_comp_dir), ""));
 }
 
+/// Make a best effort to guess the
+/// Xcode.app/Contents/Developer/Toolchains/ path from an SDK path.
+static SmallString<128> guessToolchainBaseDir(StringRef SysRoot) {
+  SmallString<128> Result;
+  // Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+  StringRef Base = sys::path::parent_path(SysRoot);
+  if (sys::path::filename(Base) != "SDKs")
+    return Result;
+  Base = sys::path::parent_path(Base);
+  Result = Base;
+  sys::path::append(Result, "Toolchains");
+  return Result;
+}
+
 /// Collect references to parseable Swift interfaces in imported
 /// DW_TAG_module blocks.
 static void analyzeImportedModule(
@@ -198,6 +212,11 @@ static void analyzeImportedModule(
     SysRoot = CU.getSysRoot();
   if (!SysRoot.empty() && Path.startswith(SysRoot))
     return;
+  // Don't track interfaces that are part of the toolchain.
+  // For example: Swift, _Concurrency, ...
+  SmallString<128> Toolchain = guessToolchainBaseDir(SysRoot);
+  if (!Toolchain.empty() && Path.startswith(Toolchain))
+    return;
   std::optional<const char *> Name =
       dwarf::toString(DIE.find(dwarf::DW_AT_name));
   if (!Name)
diff --git a/llvm/lib/DWARFLinkerParallel/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinkerParallel/DWARFLinkerCompileUnit.cpp
index 96017bac31342aa..7cd9776227836ce 100644
--- a/llvm/lib/DWARFLinkerParallel/DWARFLinkerCompileUnit.cpp
+++ b/llvm/lib/DWARFLinkerParallel/DWARFLinkerCompileUnit.cpp
@@ -167,6 +167,20 @@ void CompileUnit::cleanupDataAfterClonning() {
   getOrigUnit().clear();
 }
 
+/// Make a best effort to guess the
+/// Xcode.app/Contents/Developer/Toolchains/ path from an SDK path.
+static SmallString<128> guessToolchainBaseDir(StringRef SysRoot) {
+  SmallString<128> Result;
+  // Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
+  StringRef Base = sys::path::parent_path(SysRoot);
+  if (sys::path::filename(Base) != "SDKs")
+    return Result;
+  Base = sys::path::parent_path(Base);
+  Result = Base;
+  sys::path::append(Result, "Toolchains");
+  return Result;
+}
+
 /// Collect references to parseable Swift interfaces in imported
 /// DW_TAG_module blocks.
 void CompileUnit::analyzeImportedModule(const DWARFDebugInfoEntry *DieEntry) {
@@ -187,6 +201,11 @@ void CompileUnit::analyzeImportedModule(const DWARFDebugInfoEntry *DieEntry) {
     SysRoot = getSysRoot();
   if (!SysRoot.empty() && Path.startswith(SysRoot))
     return;
+  // Don't track interfaces that are part of the toolchain.
+  // For example: Swift, _Concurrency, ...
+  SmallString<128> Toolchain = guessToolchainBaseDir(SysRoot);
+  if (!Toolchain.empty() && Path.startswith(Toolchain))
+    return;
   if (std::optional<DWARFFormValue> Val = find(DieEntry, dwarf::DW_AT_name)) {
     Expected<const char *> Name = Val->getAsCString();
     if (!Name) {
diff --git a/llvm/test/tools/dsymutil/Inputs/swift-interface.s b/llvm/test/tools/dsymutil/Inputs/swift-interface.s
index a69f008a6cf3e9d..099f8396d40a7d5 100644
--- a/llvm/test/tools/dsymutil/Inputs/swift-interface.s
+++ b/llvm/test/tools/dsymutil/Inputs/swift-interface.s
@@ -15,17 +15,19 @@
 ##; !llvm.dbg.cu = !{!0}
 ##; !swift.module.flags = !{!14}
 ##; !llvm.module.flags = !{!20, !21, !24}
-##;  
-##; !0 = distinct !DICompileUnit(language: DW_LANG_Swift, file: !1, isOptimized: false, runtimeVersion: 5, emissionKind: FullDebug, enums: !2, imports: !3, sysroot: "/SDK")
+##;
+##; !0 = distinct !DICompileUnit(language: DW_LANG_Swift, file: !1, isOptimized: false, runtimeVersion: 5, emissionKind: FullDebug, enums: !2, imports: !3, sysroot: "/Xcode.app/Contents/Developer/SDKs/MacOSX.sdk")
 ##; !1 = !DIFile(filename: "ParseableInterfaceImports.swift", directory: "/")
 ##; !2 = !{}
-##; !3 = !{!4, !6, !8}
+##; !3 = !{!4, !6, !8, !10}
 ##; !4 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !1, entity: !5, file: !1)
 ##; !5 = !DIModule(scope: null, name: "Foo", includePath: "/Foo/x86_64.swiftinterface")
 ##; !6 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !1, entity: !7, file: !1)
-##; !7 = !DIModule(scope: null, name: "Swift", includePath: "/SDK/Swift.swiftmodule/x86_64.swiftinterface")
-##; !8 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !1, entity: !7, file: !1)
-##; !9 = !DIModule(scope: null, name: "Foundation", includePath: "/SDK/Foundation.swiftmodule")
+##; !7 = !DIModule(scope: null, name: "Swift", includePath: "/Xcode.app/Contents/Developer/SDKs/MacOSX.sdk/Swift.swiftmodule/x86_64.swiftinterface")
+##; !8 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !1, entity: !9, file: !1)
+##; !9 = !DIModule(scope: null, name: "Foundation", includePath: "/Xcode.app/Contents/Developer/SDKs/MacOSX.sdk/Foundation.swiftmodule")
+##; !10 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !1, entity: !11, file: !1)
+##; !11 = !DIModule(scope: null, name: "_Concurrency", includePath: "/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/swift/macosx/_Concurrency.swiftmodule/x86_64-apple-macos.swiftinterface")
 ##; !14 = !{!"standard-library", i1 false}
 ##; !20 = !{i32 2, !"Dwarf Version", i32 4}
 ##; !21 = !{i32 2, !"Debug Info Version", i32 3}
@@ -36,6 +38,7 @@
 ##; !35 = !DILocation(line: 0, scope: !36)
 ##; !36 = !DILexicalBlockFile(scope: !29, file: !37, discriminator: 0)
 ##; !37 = !DIFile(filename: "<compiler-generated>", directory: "")
+
 	.section	__TEXT,__text,regular,pure_instructions
 	.macosx_version_min 10, 9
 	.globl	_main                           ## -- Begin function main
@@ -145,21 +148,21 @@ Ldebug_info_start0:
 .set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section
 	.long	Lset1
 	.byte	8                               ## Address Size (in bytes)
-	.byte	1                               ## Abbrev [1] 0xb:0x60 DW_TAG_compile_unit
+	.byte	1                               ## Abbrev [1] 0xb:0x77 DW_TAG_compile_unit
 	.long	0                               ## DW_AT_producer
 	.short	30                              ## DW_AT_language
 	.long	1                               ## DW_AT_name
 	.long	33                              ## DW_AT_LLVM_sysroot
 .set Lset2, Lline_table_start0-Lsection_line ## DW_AT_stmt_list
 	.long	Lset2
-	.long	38                              ## DW_AT_comp_dir
+	.long	79                              ## DW_AT_comp_dir
 	.byte	5                               ## DW_AT_APPLE_major_runtime_vers
 	.quad	Lfunc_begin0                    ## DW_AT_low_pc
 .set Lset3, Lfunc_end0-Lfunc_begin0     ## DW_AT_high_pc
 	.long	Lset3
 	.byte	2                               ## Abbrev [2] 0x2f:0x23 DW_TAG_module
-	.long	40                              ## DW_AT_name
-	.long	44                              ## DW_AT_LLVM_include_path
+	.long	81                              ## DW_AT_name
+	.long	85                              ## DW_AT_LLVM_include_path
 	.byte	3                               ## Abbrev [3] 0x38:0x19 DW_TAG_subprogram
 	.quad	Lfunc_begin0                    ## DW_AT_low_pc
 .set Lset4, Lfunc_end0-Lfunc_begin0     ## DW_AT_high_pc
@@ -167,8 +170,8 @@ Ldebug_info_start0:
                                         ## DW_AT_APPLE_omit_frame_ptr
 	.byte	1                               ## DW_AT_frame_base
 	.byte	87
-	.long	122                             ## DW_AT_linkage_name
-	.long	122                             ## DW_AT_name
+	.long	112                             ## DW_AT_linkage_name
+	.long	112                             ## DW_AT_name
 	.byte	1                               ## DW_AT_decl_file
 	.byte	1                               ## DW_AT_decl_line
                                         ## DW_AT_external
@@ -176,25 +179,37 @@ Ldebug_info_start0:
 	.byte	4                               ## Abbrev [4] 0x52:0x5 DW_TAG_imported_module
 	.long	47                              ## DW_AT_import
 	.byte	5                               ## Abbrev [5] 0x57:0x9 DW_TAG_module
-	.long	71                              ## DW_AT_name
-	.long	77                              ## DW_AT_LLVM_include_path
+	.long	117                             ## DW_AT_name
+	.long	123                             ## DW_AT_LLVM_include_path
 	.byte	4                               ## Abbrev [4] 0x60:0x5 DW_TAG_imported_module
 	.long	87                              ## DW_AT_import
-	.byte	4                               ## Abbrev [4] 0x65:0x5 DW_TAG_imported_module
-	.long	87                              ## DW_AT_import
+	.byte	5                               ## Abbrev [5] 0x65:0x9 DW_TAG_module
+	.long	209                             ## DW_AT_name
+	.long	220                             ## DW_AT_LLVM_include_path
+	.byte	4                               ## Abbrev [4] 0x6e:0x5 DW_TAG_imported_module
+	.long	101                             ## DW_AT_import
+	.byte	5                               ## Abbrev [5] 0x73:0x9 DW_TAG_module
+	.long	289                             ## DW_AT_name
+	.long	302                             ## DW_AT_LLVM_include_path
+	.byte	4                               ## Abbrev [4] 0x7c:0x5 DW_TAG_imported_module
+	.long	115                             ## DW_AT_import
 	.byte	0                               ## End Of Children Mark
 Ldebug_info_end0:
 	.section	__DWARF,__debug_str,regular,debug
 Linfo_string:
 	.byte	0                               ## string offset=0
 	.asciz	"ParseableInterfaceImports.swift" ## string offset=1
-	.asciz	"/SDK"                          ## string offset=33
-	.asciz	"/"                             ## string offset=38
-	.asciz	"Foo"                           ## string offset=40
-	.asciz	"/Foo/x86_64.swiftinterface"    ## string offset=44
-	.asciz	"Swift"                         ## string offset=71
-	.asciz	"/SDK/Swift.swiftmodule/x86_64.swiftinterface" ## string offset=77
-	.asciz	"main"                          ## string offset=122
+	.asciz	"/Xcode.app/Contents/Developer/SDKs/MacOSX.sdk" ## string offset=33
+	.asciz	"/"                             ## string offset=79
+	.asciz	"Foo"                           ## string offset=81
+	.asciz	"/Foo/x86_64.swiftinterface"    ## string offset=85
+	.asciz	"main"                          ## string offset=112
+	.asciz	"Swift"                         ## string offset=117
+	.asciz	"/Xcode.app/Contents/Developer/SDKs/MacOSX.sdk/Swift.swiftmodule/x86_64.swiftinterface" ## string offset=123
+	.asciz	"Foundation"                    ## string offset=209
+	.asciz	"/Xcode.app/Contents/Developer/SDKs/MacOSX.sdk/Foundation.swiftmodule" ## string offset=220
+	.asciz	"_Concurrency"                  ## string offset=289
+	.asciz	"/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/swift/macosx/_Concurrency.swiftmodule/x86_64-apple-macos.swiftinterface" ## string offset=302
 	.section	__DWARF,__apple_names,regular,debug
 Lnames_begin:
 	.long	1212240712                      ## Header Magic
@@ -212,7 +227,7 @@ Lnames_begin:
 .set Lset5, LNames0-Lnames_begin        ## Offset in Bucket 0
 	.long	Lset5
 LNames0:
-	.long	122                             ## main
+	.long	112                             ## main
 	.long	1                               ## Num DIEs
 	.long	56
 	.long	0
diff --git a/llvm/test/tools/dsymutil/X86/swift-interface.test b/llvm/test/tools/dsymutil/X86/swift-interface.test
index 6aa6e8c3174f9ef..23b27bd3e12d02e 100644
--- a/llvm/test/tools/dsymutil/X86/swift-interface.test
+++ b/llvm/test/tools/dsymutil/X86/swift-interface.test
@@ -12,9 +12,9 @@
 # RUN: cat %t.dir/swift-interface.dSYM/Contents/Resources/Swift/x86_64/Foo.swiftinterface \
 # RUN:   | FileCheck %s --check-prefix=INTERFACE
 
-# WARNINGS-NOT: cannot copy parseable Swift interface {{.*}}{{Swift|Foundation}}
+# WARNINGS-NOT: cannot copy parseable Swift interface {{.*}}{{Swift|Foundation|_Concurrency}}
 # WARNINGS: cannot copy parseable Swift interface {{.*}}Foo
-# WARNINGS-NOT: cannot copy parseable Swift interface {{.*}}{{Swift|Foundation}}
+# WARNINGS-NOT: cannot copy parseable Swift interface {{.*}}{{Swift|Foundation|_Concurrency}}
 # INTERFACE: module Foo
 
 ---

>From a682a9cfd006c52559387f80398b720d529595d1 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv at gmail.com>
Date: Fri, 3 Nov 2023 21:15:46 +0100
Subject: [PATCH 73/76] Revert "Port Swift's merge function pass to llvm:
 merging functions that differ in constants (#68235)"

This reverts commit 19b5495b653a00da7a250f48b4f739fcf2bbe82f.

The PR landed without approval and with severe quality issues.
---
 .../IPO/MergeFunctionsIgnoringConst.h         |   42 -
 .../Transforms/Utils/FunctionComparator.h     |    1 -
 .../Utils/FunctionComparatorIgnoringConst.h   |   58 -
 .../Utils/MergeFunctionsIgnoringConst.h       |   29 -
 llvm/lib/Passes/PassBuilder.cpp               |    1 -
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   11 -
 llvm/lib/Passes/PassRegistry.def              |    1 -
 llvm/lib/Transforms/IPO/CMakeLists.txt        |    1 -
 .../IPO/MergeFunctionsIgnoringConst.cpp       | 1399 -----------------
 llvm/lib/Transforms/Utils/CMakeLists.txt      |    1 -
 .../Utils/FunctionComparatorIgnoringConst.cpp |  107 --
 .../MergeFuncIgnoringConst/merge_func.ll      |  532 -------
 .../merge_with_exception.ll                   |  190 ---
 13 files changed, 2373 deletions(-)
 delete mode 100644 llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h
 delete mode 100644 llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h
 delete mode 100644 llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h
 delete mode 100644 llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp
 delete mode 100644 llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp
 delete mode 100644 llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll
 delete mode 100644 llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll

diff --git a/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h b/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h
deleted file mode 100644
index 638d009abf2bffc..000000000000000
--- a/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h
+++ /dev/null
@@ -1,42 +0,0 @@
-//===- MergeFunctionsIgnoringConst.h - Merge Functions ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass transforms simple global variables that never have their address
-// taken.  If obviously true, it marks read/write globals as constant, deletes
-// variables only stored to, etc.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H
-#define LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H
-
-#include "llvm/IR/PassManager.h"
-
-namespace llvm {
-
-class Module;
-
-/// Merge functions that differ by constants.
-class MergeFuncIgnoringConstPass
-    : public PassInfoMixin<MergeFuncIgnoringConstPass> {
-  bool PtrAuthEnabled = false;
-  unsigned PtrAuthKey = 0;
-  std::string MergeFuncSuffix = ".Tm";
-
-public:
-  MergeFuncIgnoringConstPass() {}
-  MergeFuncIgnoringConstPass(bool PtrAuthEnabled, unsigned PtrAuthKey,
-                             std::string Suffix)
-      : PtrAuthEnabled(PtrAuthEnabled), PtrAuthKey(PtrAuthKey),
-        MergeFuncSuffix(Suffix) {}
-  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H
diff --git a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
index 1a314b481c72c61..c28f868039a1f7b 100644
--- a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
+++ b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
@@ -379,7 +379,6 @@ class FunctionComparator {
   /// But, we are still not able to compare operands of PHI nodes, since those
   /// could be operands from further BBs we didn't scan yet.
   /// So it's impossible to use dominance properties in general.
-protected:
   mutable DenseMap<const Value*, int> sn_mapL, sn_mapR;
 
   // The global state we will use
diff --git a/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h b/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h
deleted file mode 100644
index 9c7fe3baf2fa0db..000000000000000
--- a/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h
+++ /dev/null
@@ -1,58 +0,0 @@
-//===- FunctionComparatorIgnoringConst.h - Function Comparator --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the FunctionComparatorIgnoringConst class which is used by
-// the MergeFuncIgnoringConst pass for comparing functions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H
-#define LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils/FunctionComparator.h"
-#include <set>
-
-namespace llvm {
-
-/// FunctionComparatorIgnoringConst - Compares two functions to determine
-/// whether or not they match when certain constants are ignored.
-class FunctionComparatorIgnoringConst : public FunctionComparator {
-public:
-  FunctionComparatorIgnoringConst(const Function *F1, const Function *F2,
-                                  GlobalNumberState *GN)
-      : FunctionComparator(F1, F2, GN) {}
-
-  int cmpOperandsIgnoringConsts(const Instruction *L, const Instruction *R,
-                                unsigned opIdx);
-
-  int cmpBasicBlocksIgnoringConsts(
-      const BasicBlock *BBL, const BasicBlock *BBR,
-      const std::set<std::pair<int, int>> *InstOpndIndex = nullptr);
-
-  int compareIgnoringConsts(
-      const std::set<std::pair<int, int>> *InstOpndIndex = nullptr);
-
-  int compareConstants(const Constant *L, const Constant *R) const {
-    return cmpConstants(L, R);
-  }
-
-private:
-  /// Scratch index for instruction in order during cmpOperandsIgnoringConsts.
-  int Index = 0;
-};
-
-} // end namespace llvm
-#endif // LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H
diff --git a/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h b/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h
deleted file mode 100644
index e63afbb6bbf1718..000000000000000
--- a/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//===- MergeFunctionsIgnoringConst.h - Merge Functions ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines helpers used in the MergeFunctionsIgnoringConst.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H
-#define LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H
-
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
-
-using namespace llvm;
-
-bool isEligibleInstrunctionForConstantSharing(const Instruction *I);
-
-bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx);
-
-bool isEligibleFunction(Function *F);
-
-Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy);
-#endif // LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 789ddfcbf529879..0d7cac19d44c3a8 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -123,7 +123,6 @@
 #include "llvm/Transforms/IPO/LowerTypeTests.h"
 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
 #include "llvm/Transforms/IPO/MergeFunctions.h"
-#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h"
 #include "llvm/Transforms/IPO/ModuleInliner.h"
 #include "llvm/Transforms/IPO/OpenMPOpt.h"
 #include "llvm/Transforms/IPO/PartialInlining.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 20dbd3952beb60f..baea2913338cda7 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -60,7 +60,6 @@
 #include "llvm/Transforms/IPO/LowerTypeTests.h"
 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
 #include "llvm/Transforms/IPO/MergeFunctions.h"
-#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h"
 #include "llvm/Transforms/IPO/ModuleInliner.h"
 #include "llvm/Transforms/IPO/OpenMPOpt.h"
 #include "llvm/Transforms/IPO/PartialInlining.h"
@@ -177,10 +176,6 @@ static cl::opt<bool> EnableMergeFunctions(
     "enable-merge-functions", cl::init(false), cl::Hidden,
     cl::desc("Enable function merging as part of the optimization pipeline"));
 
-static cl::opt<bool> EnableMergeFuncIgnoringConst(
-    "enable-merge-func-ignoring-const", cl::init(false), cl::Hidden,
-    cl::desc("Enable function merger that ignores constants"));
-
 static cl::opt<bool> EnablePostPGOLoopRotation(
     "enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
     cl::desc("Run the loop rotation transformation after PGO instrumentation"));
@@ -1638,9 +1633,6 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
   MPM.addPass(buildModuleOptimizationPipeline(
       Level, ThinOrFullLTOPhase::ThinLTOPostLink));
 
-  if (EnableMergeFuncIgnoringConst)
-    MPM.addPass(MergeFuncIgnoringConstPass());
-
   // Emit annotation remarks.
   addAnnotationRemarksPass(MPM);
 
@@ -1966,9 +1958,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 
   invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
 
-  if (EnableMergeFuncIgnoringConst)
-    MPM.addPass(MergeFuncIgnoringConstPass());
-
   // Emit annotation remarks.
   addAnnotationRemarksPass(MPM);
 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index ba32c64d18423b9..eb51ccef68c827d 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -87,7 +87,6 @@ MODULE_PASS("lower-ifunc", LowerIFuncPass())
 MODULE_PASS("lowertypetests", LowerTypeTestsPass())
 MODULE_PASS("metarenamer", MetaRenamerPass())
 MODULE_PASS("mergefunc", MergeFunctionsPass())
-MODULE_PASS("mergefunc-ignoring-const", MergeFuncIgnoringConstPass())
 MODULE_PASS("name-anon-globals", NameAnonGlobalPass())
 MODULE_PASS("no-op-module", NoOpModulePass())
 MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass())
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 4dac04d3369950f..034f1587ae8df44 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -30,7 +30,6 @@ add_llvm_component_library(LLVMipo
   LowerTypeTests.cpp
   MemProfContextDisambiguation.cpp
   MergeFunctions.cpp
-  MergeFunctionsIgnoringConst.cpp
   ModuleInliner.cpp
   OpenMPOpt.cpp
   PartialInlining.cpp
diff --git a/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp b/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp
deleted file mode 100644
index d6ae788ddb9e1a1..000000000000000
--- a/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp
+++ /dev/null
@@ -1,1399 +0,0 @@
-//===--- MergeFunctionsIgnoringConst.cpp - Merge functions ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass looks for similar functions that are mergeable and folds them.
-// The implementation is similar to LLVM's MergeFunctions pass. Instead of
-// merging identical functions, it merges functions which only differ by a few
-// constants in certain instructions.
-// This is copied from Swift's implementation.
-//
-// This pass should run after LLVM's MergeFunctions pass, because it works best
-// if there are no _identical_ functions in the module.
-// Note: it would also work for identical functions but could produce more
-// code overhead than the LLVM pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/StableHashing.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ObjCARCUtil.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/StructuralHash.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h"
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mergefunc-ignoring-const"
-
-STATISTIC(NumFunctionsMergedIgnoringConst, "Number of functions merged");
-STATISTIC(NumThunksWrittenIgnoringConst, "Number of thunks generated");
-
-static cl::opt<bool> EnableAggressiveMergeFunc(
-    "enable-aggressive-mergefunc-ignoringconst", cl::init(false), cl::Hidden,
-    cl::desc("Enable more aggressive function merger"));
-
-static cl::opt<unsigned> NumFunctionsIgnoringConstForSanityCheck(
-    "mergefunc-ignoringconst-sanity",
-    cl::desc("How many functions in module could be used for "
-             "MergeFunctionsIgnoringConst pass sanity check. "
-             "'0' disables this check. Works only with '-debug' key."),
-    cl::init(0), cl::Hidden);
-
-static cl::opt<unsigned> IgnoringConstMergeThreshold(
-    "mergefunc-ignoringconst-threshold",
-    cl::desc("Functions larger than the threshold are considered for merging."
-             "'0' disables function merging at all."),
-    cl::init(15), cl::Hidden);
-
-cl::opt<bool> UseLinkOnceODRLinkageMerging(
-    "use-linkonceodr-linkage-merging", cl::init(false), cl::Hidden,
-    cl::desc(
-        "Use LinkeOnceODR linkage to deduplicate the identical merged function "
-        "(default = off)"));
-
-cl::opt<bool> NoInlineForMergedFunction(
-    "no-inline-merged-function", cl::init(false), cl::Hidden,
-    cl::desc("set noinline for merged function (default = off)"));
-
-static cl::opt<bool>
-    CastArrayType("merge-cast-array-type", cl::init(false), cl::Hidden,
-                  cl::desc("support for casting array type (default = off)"));
-
-static cl::opt<bool> IgnoreMusttailFunction(
-    "ignore-musttail-function", cl::init(false), cl::Hidden,
-    cl::desc(
-        "ignore functions containing callsites with musttail (default = off)"));
-
-static cl::opt<bool> AlwaysCallThunk(
-    "merge-always-call-thunk", cl::init(false), cl::Hidden,
-    cl::desc(
-        "do not replace callsites and always emit a thunk (default = off)"));
-
-static cl::list<std::string> MergeBlockRegexFilters(
-    "merge-block-regex", cl::Optional,
-    cl::desc("Block functions from merging if they match the given "
-             "regular expression"),
-    cl::ZeroOrMore);
-
-static cl::list<std::string> MergeAllowRegexFilters(
-    "merge-allow-regex", cl::Optional,
-    cl::desc("Allow functions from merging if they match the given "
-             "regular expression"),
-    cl::ZeroOrMore);
-
-bool isEligibleInstrunctionForConstantSharing(const Instruction *I) {
-  switch (I->getOpcode()) {
-  case Instruction::Load:
-  case Instruction::Store:
-  case Instruction::Call:
-    return true;
-  default: {
-    if (EnableAggressiveMergeFunc && I->getOpcode() == Instruction::Invoke)
-      return true;
-    return false;
-  }
-  }
-}
-
-/// Returns true if the \OpIdx operand of \p CI is the callee operand.
-static bool isCalleeOperand(const CallBase *CI, unsigned OpIdx) {
-  return &CI->getCalledOperandUse() == &CI->getOperandUse(OpIdx);
-}
-
-static bool canParameterizeCallOperand(const CallBase *CI, unsigned OpIdx) {
-  if (CI->isInlineAsm())
-    return false;
-  Function *Callee = CI->getCalledOperand()
-                         ? dyn_cast_or_null<Function>(
-                               CI->getCalledOperand()->stripPointerCasts())
-                         : nullptr;
-  if (Callee) {
-    if (Callee->isIntrinsic())
-      return false;
-    // objc_msgSend stubs must be called, and can't have their address taken.
-    if (Callee->getName().startswith("objc_msgSend$"))
-      return false;
-  }
-  if (isCalleeOperand(CI, OpIdx) &&
-      CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value()) {
-    // The operand is the callee and it has already been signed. Ignore this
-    // because we cannot add another ptrauth bundle to the call instruction.
-    return false;
-  }
-  return true;
-}
-
-bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx) {
-  assert(OpIdx < I->getNumOperands() && "Invalid operand index");
-
-  if (!isEligibleInstrunctionForConstantSharing(I))
-    return false;
-
-  auto Opnd = I->getOperand(OpIdx);
-  if (!isa<Constant>(Opnd))
-    return false;
-
-  if (const auto *CI = dyn_cast<CallBase>(I))
-    return canParameterizeCallOperand(CI, OpIdx);
-
-  return true;
-}
-
-namespace {
-
-/// MergeFuncIgnoringConst finds functions which only differ by constants in
-/// certain instructions, e.g. resulting from specialized functions of layout
-/// compatible types.
-/// Such functions are merged by replacing the differing constants by a
-/// parameter. The original functions are replaced by thunks which call the
-/// merged function with the specific argument constants.
-///
-class MergeFuncIgnoringConstImpl {
-public:
-  MergeFuncIgnoringConstImpl(bool PtrAuthEnabled, unsigned PtrAuthKey,
-                             std::string Suffix)
-      : FnTree(FunctionNodeCmp(&GlobalNumbers)), PtrAuthEnabled(PtrAuthEnabled),
-        PtrAuthKey(PtrAuthKey), MergeFuncSuffix(Suffix) {}
-
-  bool runImpl(Module &M);
-
-private:
-  struct FunctionEntry;
-
-  /// Describes the set of functions which are considered as "equivalent" (i.e.
-  /// only differing by some constants).
-  struct EquivalenceClass {
-    /// The single-linked list of all functions which are a member of this
-    /// equivalence class.
-    FunctionEntry *First;
-
-    /// A very cheap hash, used to early exit if functions do not match.
-    llvm::IRHash Hash;
-
-  public:
-    // Note the hash is recalculated potentially multiple times, but it is
-    // cheap.
-    EquivalenceClass(FunctionEntry *First)
-        : First(First), Hash(StructuralHash(*First->F)) {
-      assert(!First->Next);
-    }
-  };
-
-  /// The function comparison operator is provided here so that FunctionNodes do
-  /// not need to become larger with another pointer.
-  class FunctionNodeCmp {
-    GlobalNumberState *GlobalNumbers;
-
-  public:
-    FunctionNodeCmp(GlobalNumberState *GN) : GlobalNumbers(GN) {}
-    bool operator()(const EquivalenceClass &LHS,
-                    const EquivalenceClass &RHS) const {
-      // Order first by hashes, then full function comparison.
-      if (LHS.Hash != RHS.Hash)
-        return LHS.Hash < RHS.Hash;
-      FunctionComparatorIgnoringConst FCmp(LHS.First->F, RHS.First->F,
-                                           GlobalNumbers);
-      return FCmp.compareIgnoringConsts() == -1;
-    }
-  };
-  using FnTreeType = std::set<EquivalenceClass, FunctionNodeCmp>;
-
-  ///
-  struct FunctionEntry {
-    FunctionEntry(Function *F, FnTreeType::iterator I)
-        : F(F), Next(nullptr), NumUnhandledCallees(0), TreeIter(I),
-          IsMerged(false) {}
-
-    /// Back-link to the function.
-    AssertingVH<Function> F;
-
-    /// The next function in its equivalence class.
-    FunctionEntry *Next;
-
-    /// The number of not-yet merged callees. Used to process the merging in
-    /// bottom-up call order.
-    /// This is only valid in the first entry of an equivalence class. The
-    /// counts of all functions in an equivalence class are accumulated in the
-    /// first entry.
-    int NumUnhandledCallees;
-
-    /// The iterator of the function's equivalence class in the FnTree.
-    /// It's FnTree.end() if the function is not in an equivalence class.
-    FnTreeType::iterator TreeIter;
-
-    /// True if this function is already a thunk, calling the merged function.
-    bool IsMerged;
-  };
-
-  /// Describes an operator of a specific instruction.
-  struct OpLocation {
-    Instruction *I;
-    unsigned OpIndex;
-  };
-
-  /// Information for a function. Used during merging.
-  struct FunctionInfo {
-
-    FunctionInfo(Function *F)
-        : F(F), CurrentInst(nullptr), NumParamsNeeded(0) {}
-
-    void init() {
-      CurrentInst = &*F->begin()->begin();
-      NumParamsNeeded = 0;
-    }
-
-    /// Advances the current instruction to the next instruction.
-    void nextInst() {
-      assert(CurrentInst);
-      if (CurrentInst->isTerminator()) {
-        auto BlockIter = std::next(CurrentInst->getParent()->getIterator());
-        if (BlockIter == F->end()) {
-          CurrentInst = nullptr;
-          return;
-        }
-        CurrentInst = &*BlockIter->begin();
-        return;
-      }
-      CurrentInst = &*std::next(CurrentInst->getIterator());
-    }
-
-    /// Returns true if the operand \p OpIdx of the current instruction is the
-    /// callee of a call, which needs to be signed if passed as a parameter.
-    bool needsPointerSigning(unsigned OpIdx) const {
-      if (auto *CI = dyn_cast<CallInst>(CurrentInst))
-        return isCalleeOperand(CI, OpIdx);
-      return false;
-    }
-
-    Function *F;
-
-    /// The current instruction while iterating over all instructions.
-    Instruction *CurrentInst;
-
-    /// Roughly the number of parameters needed if this function would be
-    /// merged with the first function of the equivalence class.
-    int NumParamsNeeded;
-  };
-
-  using FunctionInfos = SmallVector<FunctionInfo, 8>;
-
-  /// Describes a parameter which we create to parameterize the merged function.
-  struct ParamInfo {
-    /// The value of the parameter for all the functions in the equivalence
-    /// class.
-    SmallVector<Constant *, 8> Values;
-
-    /// All uses of the parameter in the merged function.
-    SmallVector<OpLocation, 16> Uses;
-
-    /// The Discriminator for pointer signing.
-    /// Only not null if needsPointerSigning is true.
-    ConstantInt *Discriminator = nullptr;
-
-    /// True if the value is a callee function, which needs to be signed if
-    /// passed as a parameter.
-    bool NeedsPointerSigning = false;
-
-    /// Checks if this parameter can be used to describe an operand in all
-    /// functions of the equivalence class. Returns true if all values match
-    /// the specific instruction operands in all functions.
-    bool matches(const FunctionInfos &FInfos, unsigned OpIdx,
-                 bool PtrAuthEnabled) const {
-      unsigned NumFuncs = FInfos.size();
-      assert(Values.size() == NumFuncs);
-      if (PtrAuthEnabled &&
-          NeedsPointerSigning != FInfos[0].needsPointerSigning(OpIdx)) {
-        return false;
-      }
-      for (unsigned Idx = 0; Idx < NumFuncs; ++Idx) {
-        const FunctionInfo &FI = FInfos[Idx];
-        Constant *C = cast<Constant>(FI.CurrentInst->getOperand(OpIdx));
-        if (Values[Idx] != C)
-          return false;
-      }
-      return true;
-    }
-
-    /// Computes the Discriminator for pointer signing.
-    void computeDiscriminator(LLVMContext &Context) {
-      assert(NeedsPointerSigning);
-      assert(!Discriminator);
-
-      /// Get a hash from the concatenated function names.
-      /// The hash is deterministic, because the order of values depends on the
-      /// order of functions in the module, which is itself deterministic.
-      /// Note that the hash is not part of the ABI, because it's purly used
-      /// for pointer authentication between a module-private caller-callee
-      /// pair.
-      std::string concatenatedCalleeNames;
-      for (Constant *value : Values) {
-        if (auto *GO = dyn_cast<GlobalObject>(value))
-          concatenatedCalleeNames += GO->getName();
-      }
-      uint64_t rawHash = stable_hash_combine_string(concatenatedCalleeNames);
-      IntegerType *discrTy = Type::getInt64Ty(Context);
-      Discriminator = ConstantInt::get(discrTy, (rawHash % 0xFFFF) + 1);
-    }
-  };
-
-  using ParamInfos = SmallVector<ParamInfo, 16>;
-
-  Module *CurrentModule = nullptr;
-
-  GlobalNumberState GlobalNumbers;
-
-  /// A work queue of functions that may have been modified and should be
-  /// analyzed again.
-  std::vector<WeakTrackingVH> Deferred;
-
-  /// The set of all distinct functions. Use the insert() and remove() methods
-  /// to modify it. The map allows efficient lookup and deferring of Functions.
-  FnTreeType FnTree;
-
-  ValueMap<Function *, FunctionEntry *> FuncEntries;
-
-  // Maps a function-pointer / Discriminator pair to a corresponding global in
-  // the llvm.ptrauth section.
-  // This map is used as a cache to not create ptrauth globals twice.
-  DenseMap<std::pair<Constant *, ConstantInt *>, Constant *> PtrAuthGlobals;
-
-  /// True if the architecture has pointer authentication enabled.
-  bool PtrAuthEnabled = false;
-
-  /// The key for pointer authentication.
-  unsigned PtrAuthKey = 0;
-
-  std::string MergeFuncSuffix = ".Tm";
-
-  FunctionEntry *getEntry(Function *F) const { return FuncEntries.lookup(F); }
-
-  bool isInEquivalenceClass(FunctionEntry *FE) const {
-    if (FE->TreeIter != FnTree.end()) {
-      return true;
-    }
-    assert(!FE->Next);
-    assert(FE->NumUnhandledCallees == 0);
-    return false;
-  }
-
-  /// Checks the rules of order relation introduced among functions set.
-  /// Returns true, if sanity check has been passed, and false if failed.
-  bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
-
-  /// Updates the NumUnhandledCallees of all user functions of the equivalence
-  /// class containing \p FE by \p Delta.
-  void updateUnhandledCalleeCount(FunctionEntry *FE, int Delta);
-
-  bool tryMergeEquivalenceClass(FunctionEntry *FirstInClass);
-
-  FunctionInfo removeFuncWithMostParams(FunctionInfos &FInfos);
-
-  bool deriveParams(ParamInfos &Params, FunctionInfos &FInfos,
-                    unsigned maxParams);
-
-  bool numOperandsDiffer(FunctionInfos &FInfos);
-
-  bool constsDiffer(const FunctionInfos &FInfos, unsigned OpIdx);
-
-  bool tryMapToParameter(FunctionInfos &FInfos, unsigned OpIdx,
-                         ParamInfos &Params, unsigned maxParams);
-
-  void replaceCallWithAddedPtrAuth(CallInst *origCall, Value *newCallee,
-                                   ConstantInt *Discriminator);
-
-  void mergeWithParams(const FunctionInfos &FInfos, ParamInfos &Params);
-  static void dumpMergeInfo(const FunctionInfos &FInfos, unsigned);
-
-  void removeEquivalenceClassFromTree(FunctionEntry *FE);
-
-  void writeThunk(Function *ToFunc, Function *Thunk, const ParamInfos &Params,
-                  unsigned FuncIdx);
-
-  bool isPtrAuthEnabled() const {
-    // TODO: fix pointer authentication
-    return PtrAuthEnabled;
-  }
-
-  ConstantInt *getPtrAuthKey() {
-    // TODO: fix pointer authentication
-    return ConstantInt::get(Type::getInt32Ty(CurrentModule->getContext()),
-                            PtrAuthKey);
-  }
-
-  /// Returns the value of function \p FuncIdx, and signes it if required.
-  Constant *getSignedValue(const ParamInfo &PI, unsigned FuncIdx) {
-    Constant *value = PI.Values[FuncIdx];
-    if (!PI.NeedsPointerSigning)
-      return value;
-
-    auto lookupKey = std::make_pair(value, PI.Discriminator);
-    Constant *&ptrAuthGlobal = PtrAuthGlobals[lookupKey];
-    if (!ptrAuthGlobal) {
-      // TODO: fix pointer authentication
-    }
-    return ptrAuthGlobal;
-  }
-
-  /// Replace all direct calls of Old with calls of New. Will bitcast New if
-  /// necessary to make types match.
-  bool replaceDirectCallers(Function *Old, Function *New,
-                            const ParamInfos &Params, unsigned FuncIdx);
-};
-
-} // end anonymous namespace
-
-bool MergeFuncIgnoringConstImpl::doSanityCheck(
-    std::vector<WeakTrackingVH> &Worklist) {
-  if (const unsigned Max = NumFunctionsIgnoringConstForSanityCheck) {
-    unsigned TripleNumber = 0;
-    bool Valid = true;
-
-    dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
-
-    unsigned i = 0;
-    for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(),
-                                               E = Worklist.end();
-         I != E && i < Max; ++I, ++i) {
-      unsigned j = i;
-      for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max;
-           ++J, ++j) {
-        Function *F1 = cast<Function>(*I);
-        Function *F2 = cast<Function>(*J);
-        int Res1 = FunctionComparatorIgnoringConst(F1, F2, &GlobalNumbers)
-                       .compareIgnoringConsts();
-        int Res2 = FunctionComparatorIgnoringConst(F2, F1, &GlobalNumbers)
-                       .compareIgnoringConsts();
-
-        // If F1 <= F2, then F2 >= F1, otherwise report failure.
-        if (Res1 != -Res2) {
-          dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
-                 << "\n";
-          LLVM_DEBUG(F1->dump());
-          LLVM_DEBUG(F2->dump());
-          Valid = false;
-        }
-
-        if (Res1 == 0)
-          continue;
-
-        unsigned k = j;
-        for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max;
-             ++k, ++K, ++TripleNumber) {
-          if (K == J)
-            continue;
-
-          Function *F3 = cast<Function>(*K);
-          int Res3 = FunctionComparatorIgnoringConst(F1, F3, &GlobalNumbers)
-                         .compareIgnoringConsts();
-          int Res4 = FunctionComparatorIgnoringConst(F2, F3, &GlobalNumbers)
-                         .compareIgnoringConsts();
-
-          bool Transitive = true;
-
-          if (Res1 != 0 && Res1 == Res4) {
-            // F1 > F2, F2 > F3 => F1 > F3
-            Transitive = Res3 == Res1;
-          } else if (Res3 != 0 && Res3 == -Res4) {
-            // F1 > F3, F3 > F2 => F1 > F2
-            Transitive = Res3 == Res1;
-          } else if (Res4 != 0 && -Res3 == Res4) {
-            // F2 > F3, F3 > F1 => F2 > F1
-            Transitive = Res4 == -Res1;
-          }
-
-          if (!Transitive) {
-            dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: "
-                   << TripleNumber << "\n";
-            dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
-                   << Res4 << "\n";
-            LLVM_DEBUG(F1->dump());
-            LLVM_DEBUG(F2->dump());
-            LLVM_DEBUG(F3->dump());
-            Valid = false;
-          }
-        }
-      }
-    }
-
-    dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n";
-    return Valid;
-  }
-  return true;
-}
-
-/// Returns true if functions containing calls to \p F may be merged together.
-static bool mayMergeCallsToFunction(Function &F) {
-  StringRef Name = F.getName();
-
-  // Calls to dtrace probes must generate unique patchpoints.
-  if (Name.startswith("__dtrace"))
-    return false;
-
-  return true;
-}
-
-/// Returns the benefit, which is approximately the size of the function.
-/// Return 0, if the function should not be merged.
-static unsigned getBenefit(Function *F) {
-  unsigned Benefit = 0;
-
-  // We don't want to merge very small functions, because the overhead of
-  // adding creating thunks and/or adding parameters to the call sites
-  // outweighs the benefit.
-  for (BasicBlock &BB : *F) {
-    for (Instruction &I : BB) {
-      if (CallBase *CB = dyn_cast<CallBase>(&I)) {
-        Function *Callee = CB->getCalledFunction();
-        if (Callee && !mayMergeCallsToFunction(*Callee))
-          return 0;
-        if (!Callee || !Callee->isIntrinsic()) {
-          Benefit += 5;
-          continue;
-        }
-      }
-      Benefit += 1;
-    }
-  }
-  return Benefit;
-}
-
-/// Returns true if function \p F is eligible for merging.
-bool isEligibleFunction(Function *F) {
-  if (F->isDeclaration())
-    return false;
-
-  if (F->hasFnAttribute(llvm::Attribute::NoMerge))
-    return false;
-
-  if (F->hasAvailableExternallyLinkage()) {
-    return false;
-  }
-
-  if (F->getFunctionType()->isVarArg()) {
-    return false;
-  }
-
-  // Check against blocklist.
-  if (!MergeBlockRegexFilters.empty()) {
-    StringRef FuncName = F->getName();
-    for (const auto &tRegex : MergeBlockRegexFilters)
-      if (Regex(tRegex).match(FuncName)) {
-        return false;
-      }
-  }
-  // Check against allowlist
-  if (!MergeAllowRegexFilters.empty()) {
-    StringRef FuncName = F->getName();
-    bool found = false;
-    for (const auto &tRegex : MergeAllowRegexFilters)
-      if (Regex(tRegex).match(FuncName)) {
-        found = true;
-        break;
-      }
-    if (!found)
-      return false;
-  }
-
-  if (F->getCallingConv() == CallingConv::SwiftTail)
-    return false;
-
-  // if function contains callsites with musttail, if we merge
-  // it, the merged function will have the musttail callsite, but
-  // the number of parameters can change, thus the parameter count
-  // of the callsite will mismatch with the function itself.
-  if (IgnoreMusttailFunction) {
-    for (const BasicBlock &BB : *F) {
-      for (const Instruction &I : BB) {
-        const auto *CB = dyn_cast<CallBase>(&I);
-        if (CB && CB->isMustTailCall())
-          return false;
-      }
-    }
-  }
-
-  unsigned Benefit = getBenefit(F);
-  if (Benefit < IgnoringConstMergeThreshold) {
-    return false;
-  }
-
-  return true;
-}
-
-bool MergeFuncIgnoringConstImpl::runImpl(Module &M) {
-  if (IgnoringConstMergeThreshold == 0)
-    return false;
-
-  CurrentModule = &M;
-
-  // TODO: fix pointer authentication
-
-  bool Changed = false;
-
-  // All functions in the module, ordered by hash. Functions with a unique
-  // hash value are easily eliminated.
-  std::vector<std::pair<llvm::IRHash, Function *>> HashedFuncs;
-
-  for (Function &Func : M) {
-    if (isEligibleFunction(&Func)) {
-      HashedFuncs.push_back({StructuralHash(Func), &Func});
-    }
-  }
-
-  std::stable_sort(HashedFuncs.begin(), HashedFuncs.end(),
-                   [](const std::pair<llvm::IRHash, Function *> &a,
-                      const std::pair<llvm::IRHash, Function *> &b) {
-                     return a.first < b.first;
-                   });
-
-  std::vector<FunctionEntry> FuncEntryStorage;
-  FuncEntryStorage.reserve(HashedFuncs.size());
-
-  auto S = HashedFuncs.begin();
-  for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) {
-
-    Function *F = I->second;
-    FuncEntryStorage.push_back(FunctionEntry(F, FnTree.end()));
-    FunctionEntry &FE = FuncEntryStorage.back();
-    FuncEntries[F] = &FE;
-
-    // If the hash value matches the previous value or the next one, we must
-    // consider merging it. Otherwise it is dropped and never considered again.
-    if ((I != S && std::prev(I)->first == I->first) ||
-        (std::next(I) != IE && std::next(I)->first == I->first)) {
-      Deferred.push_back(WeakTrackingVH(F));
-    }
-  }
-
-  do {
-    std::vector<WeakTrackingVH> Worklist;
-    Deferred.swap(Worklist);
-
-    LLVM_DEBUG(dbgs() << "======\nbuild tree: worklist-size=" << Worklist.size()
-                      << '\n');
-    LLVM_DEBUG(doSanityCheck(Worklist));
-
-    SmallVector<FunctionEntry *, 8> FuncsToMerge;
-
-    // Insert all candidates from the Worklist into the FnTree.
-    for (WeakTrackingVH &I : Worklist) {
-      if (!I)
-        continue;
-      Function *F = cast<Function>(I);
-      FunctionEntry *FE = getEntry(F);
-      assert(!isInEquivalenceClass(FE));
-
-      std::pair<FnTreeType::iterator, bool> Result = FnTree.insert(FE);
-
-      FE->TreeIter = Result.first;
-      const EquivalenceClass &Eq = *Result.first;
-
-      if (Result.second) {
-        assert(Eq.First == FE);
-        LLVM_DEBUG(dbgs() << "  new in tree: " << F->getName() << '\n');
-      } else {
-        assert(Eq.First != FE);
-        LLVM_DEBUG(dbgs() << "  add to existing: " << F->getName() << '\n');
-        // Add the function to the existing equivalence class.
-        FE->Next = Eq.First->Next;
-        Eq.First->Next = FE;
-        // Schedule for merging once the function's equivalence class reaches
-        // a size of 2.
-        if (!FE->Next)
-          FuncsToMerge.push_back(Eq.First);
-      }
-    }
-    LLVM_DEBUG(dbgs() << "merge functions: tree-size=" << FnTree.size()
-                      << '\n');
-
-    // Figure out the leaf functions. We want to do the merging in bottom-up
-    // call order. This ensures that we don't parameterize on callee function
-    // names if we don't have to (because the callee may be merged).
-    // Note that "leaf functions" are defined with respect to the
-    // sub-call-graph of functions which are in the FnTree.
-    for (FunctionEntry *ToMerge : FuncsToMerge) {
-      assert(isInEquivalenceClass(ToMerge));
-      updateUnhandledCalleeCount(ToMerge, 1);
-    }
-
-    // Check if there are any leaf functions at all.
-    bool LeafFound = false;
-    for (FunctionEntry *ToMerge : FuncsToMerge) {
-      if (ToMerge->NumUnhandledCallees == 0)
-        LeafFound = true;
-    }
-    for (FunctionEntry *ToMerge : FuncsToMerge) {
-      if (isInEquivalenceClass(ToMerge)) {
-        // Only merge leaf functions (or all functions if all functions are in
-        // a call cycle).
-        if (ToMerge->NumUnhandledCallees == 0 || !LeafFound) {
-          updateUnhandledCalleeCount(ToMerge, -1);
-          Changed |= tryMergeEquivalenceClass(ToMerge);
-        } else {
-          // Non-leaf functions (i.e. functions in a call cycle) may become
-          // leaf functions in the next iteration.
-          removeEquivalenceClassFromTree(ToMerge);
-        }
-      }
-    }
-  } while (!Deferred.empty());
-
-  FnTree.clear();
-  GlobalNumbers.clear();
-  FuncEntries.clear();
-  PtrAuthGlobals.clear();
-
-  return Changed;
-}
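
The candidate filtering above boils down to: stable-sort by structural hash,
then keep only functions whose hash collides with an adjacent entry. Below
is a self-contained sketch of just that step, with uint64_t and std::string
as hypothetical stand-ins for llvm::IRHash and Function*:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Keep only entries whose hash matches a neighbor after a stable sort,
    // as in the Deferred-population loop above.
    std::vector<std::string>
    collectCollidingNames(std::vector<std::pair<uint64_t, std::string>> Funcs) {
      std::stable_sort(Funcs.begin(), Funcs.end(),
                       [](const auto &A, const auto &B) {
                         return A.first < B.first;
                       });
      std::vector<std::string> Colliding;
      for (size_t I = 0, E = Funcs.size(); I != E; ++I) {
        bool PrevSame = I > 0 && Funcs[I - 1].first == Funcs[I].first;
        bool NextSame = I + 1 < E && Funcs[I + 1].first == Funcs[I].first;
        if (PrevSame || NextSame)
          Colliding.push_back(Funcs[I].second);
      }
      return Colliding;
    }

Functions with a unique hash are dropped here and never considered again,
which confines the more expensive pairwise comparison work to hash buckets.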
-
-void MergeFuncIgnoringConstImpl::updateUnhandledCalleeCount(FunctionEntry *FE,
-                                                            int Delta) {
-  // Iterate over all functions of FE's equivalence class.
-  do {
-    for (Use &U : FE->F->uses()) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        FunctionEntry *CallerFE = getEntry(I->getFunction());
-        if (CallerFE && CallerFE->TreeIter != FnTree.end()) {
-          // Accumulate the count in the first entry of the equivalence class.
-          FunctionEntry *Head = CallerFE->TreeIter->First;
-          Head->NumUnhandledCallees += Delta;
-        }
-      }
-    }
-    FE = FE->Next;
-  } while (FE);
-}
-
-bool MergeFuncIgnoringConstImpl::tryMergeEquivalenceClass(
-    FunctionEntry *FirstInClass) {
-  // Build the FInfos vector from all functions in the equivalence class.
-  FunctionInfos FInfos;
-  FunctionEntry *FE = FirstInClass;
-  do {
-    FInfos.push_back(FunctionInfo(FE->F));
-    FE->IsMerged = true;
-    FE = FE->Next;
-  } while (FE);
-  assert(FInfos.size() >= 2);
-
-  // Merged or not: in any case we remove the equivalence class from the FnTree.
-  removeEquivalenceClassFromTree(FirstInClass);
-
-  // Contains functions which differ too much from the first function (i.e.
-  // would need too many parameters).
-  FunctionInfos Removed;
-
-  bool Changed = false;
-  int Try = 0;
-
-  unsigned Benefit = getBenefit(FirstInClass->F);
-
-  // The bigger the function, the more parameters are allowed.
-  unsigned maxParams = std::max(4u, Benefit / 100);
-
-  // We need multiple tries if there are some functions in FInfos which differ
-  // too much from the first function in FInfos. But we limit the number of
-  // tries to a small number, because this is quadratic.
-  while (FInfos.size() >= 2 && Try++ < 4) {
-    ParamInfos Params;
-    bool Merged = deriveParams(Params, FInfos, maxParams);
-    if (Merged) {
-      mergeWithParams(FInfos, Params);
-      Changed = true;
-    } else {
-      // We ran out of parameters. Remove the function from the set which
-      // differs most from the first function.
-      Removed.push_back(removeFuncWithMostParams(FInfos));
-    }
-    if (Merged || FInfos.size() < 2) {
-      // Try again with the functions which were removed from the original set.
-      FInfos.swap(Removed);
-      Removed.clear();
-    }
-  }
-  return Changed;
-}
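
As a worked instance of the parameter budget above: a first function with
Benefit 500 gets maxParams = max(4, 500 / 100) = 5, while any function with
Benefit below 500 keeps the floor of 4 (an illustrative calculation only).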
-
-/// Remove the function from \p FInfos which needs the most parameters and
-/// return it, so the caller can retry merging without it.
-MergeFuncIgnoringConstImpl::FunctionInfo
-MergeFuncIgnoringConstImpl::removeFuncWithMostParams(FunctionInfos &FInfos) {
-  FunctionInfos::iterator MaxIter = FInfos.end();
-  for (auto Iter = FInfos.begin(), End = FInfos.end(); Iter != End; ++Iter) {
-    if (MaxIter == FInfos.end() ||
-        Iter->NumParamsNeeded > MaxIter->NumParamsNeeded) {
-      MaxIter = Iter;
-    }
-  }
-  FunctionInfo Removed = *MaxIter;
-  FInfos.erase(MaxIter);
-  return Removed;
-}
-
-/// Finds the set of parameters which are required to merge the functions in
-/// \p FInfos.
-/// Returns true on success, i.e. the functions in \p FInfos can be merged with
-/// the parameters returned in \p Params.
-bool MergeFuncIgnoringConstImpl::deriveParams(ParamInfos &Params,
-                                              FunctionInfos &FInfos,
-                                              unsigned maxParams) {
-  for (FunctionInfo &FI : FInfos)
-    FI.init();
-
-  FunctionInfo &FirstFI = FInfos.front();
-
-  // Iterate over all instructions synchronously in all functions.
-  do {
-    if (isEligibleInstrunctionForConstantSharing(FirstFI.CurrentInst)) {
-
-      // Here we handle a rare corner case which needs to be explained:
-      // usually the number of operands matches, because otherwise the
-      // functions in FInfos would not be in the same equivalence class.
-      // There is only one exception to that: if the current instruction is a
-      // call to a function which was merged in the previous iteration (in
-      // tryMergeEquivalenceClass), then the call could have been replaced
-      // and have more arguments than the original call.
-      if (numOperandsDiffer(FInfos)) {
-        assert(isa<CallInst>(FirstFI.CurrentInst) &&
-               "only calls are expected to differ in number of operands");
-        return false;
-      }
-
-      for (unsigned OpIdx = 0, NumOps = FirstFI.CurrentInst->getNumOperands();
-           OpIdx != NumOps; ++OpIdx) {
-
-        if (constsDiffer(FInfos, OpIdx)) {
-          // This instruction has operands which differ in at least some
-          // functions. So we need to parameterize it.
-          if (!tryMapToParameter(FInfos, OpIdx, Params, maxParams)) {
-            // We ran out of parameters.
-            return false;
-          }
-        }
-      }
-    }
-    // Go to the next instruction in all functions.
-    for (FunctionInfo &FI : FInfos)
-      FI.nextInst();
-  } while (FirstFI.CurrentInst);
-
-  return true;
-}
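
A concrete instance of that corner case appears in the merge_func.ll test
below: the call in Function4_not_merged is rewritten to target
Function1_merged_with_3.Tm with an extra ptr argument, so at that position
it carries one more operand than the matching call in Function2_not_merged;
numOperandsDiffer fires and the two functions are not merged.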
-
-/// Returns true if the number of operands of the current instruction differs.
-bool MergeFuncIgnoringConstImpl::numOperandsDiffer(FunctionInfos &FInfos) {
-  unsigned numOps = FInfos[0].CurrentInst->getNumOperands();
-  for (const FunctionInfo &FI : ArrayRef<FunctionInfo>(FInfos).drop_front(1)) {
-    if (FI.CurrentInst->getNumOperands() != numOps)
-      return true;
-  }
-  return false;
-}
-
-/// Returns true if the constant operand at index \p OpIdx of the current
-/// instruction differs in any of the functions in \p FInfos.
-bool MergeFuncIgnoringConstImpl::constsDiffer(const FunctionInfos &FInfos,
-                                              unsigned OpIdx) {
-  Constant *CommonConst = nullptr;
-
-  for (const FunctionInfo &FI : FInfos) {
-    Value *Op = FI.CurrentInst->getOperand(OpIdx);
-    if (auto *C = dyn_cast<Constant>(Op)) {
-      if (!CommonConst) {
-        CommonConst = C;
-      } else if (EnableAggressiveMergeFunc &&
-                 isa<ConstantPointerNull>(CommonConst) &&
-                 isa<ConstantPointerNull>(C)) {
-        // If both are null pointers that are only different constants
-        // because of their type, still treat them as the same.
-      } else if (C != CommonConst) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-/// Create a new parameter for differing operands or try to reuse an existing
-/// parameter.
-/// Returns true if a parameter could be created or found without exceeding the
-/// maximum number of parameters.
-bool MergeFuncIgnoringConstImpl::tryMapToParameter(FunctionInfos &FInfos,
-                                                   unsigned OpIdx,
-                                                   ParamInfos &Params,
-                                                   unsigned maxParams) {
-  ParamInfo *Matching = nullptr;
-  // Try to find an existing parameter which exactly matches the differing
-  // operands of the current instruction.
-  for (ParamInfo &PI : Params) {
-    if (PI.matches(FInfos, OpIdx, isPtrAuthEnabled())) {
-      Matching = &PI;
-      break;
-    }
-  }
-  if (!Matching) {
-    // We need a new parameter.
-    // Check if we are within the limit.
-    if (Params.size() >= maxParams)
-      return false;
-
-    Params.resize(Params.size() + 1);
-    Matching = &Params.back();
-    // Store the constant values into the new parameter.
-    Constant *FirstC = cast<Constant>(FInfos[0].CurrentInst->getOperand(OpIdx));
-    for (FunctionInfo &FI : FInfos) {
-      Constant *C = cast<Constant>(FI.CurrentInst->getOperand(OpIdx));
-      Matching->Values.push_back(C);
-      if (C != FirstC)
-        FI.NumParamsNeeded += 1;
-    }
-    if (isPtrAuthEnabled())
-      Matching->NeedsPointerSigning = FInfos[0].needsPointerSigning(OpIdx);
-  }
-  // Remember where the parameter is needed when we build our merged function.
-  Matching->Uses.push_back({FInfos[0].CurrentInst, OpIdx});
-  return true;
-}
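
The matching logic above reduces to: reuse a parameter whose per-function
value tuple is identical, otherwise create a new one unless the cap is hit.
A simplified sketch with int standing in for Constant* (the name mapToParam
is hypothetical, and pointer signing is ignored):

    #include <cstddef>
    #include <optional>
    #include <vector>

    // Map a tuple of per-function constants to a parameter index, reusing
    // an existing parameter when the tuple matches exactly and giving up
    // when MaxParams would be exceeded.
    std::optional<size_t>
    mapToParam(std::vector<std::vector<int>> &Params,
               const std::vector<int> &Tuple, size_t MaxParams) {
      for (size_t I = 0; I != Params.size(); ++I)
        if (Params[I] == Tuple)
          return I;                // Reuse an existing parameter.
      if (Params.size() >= MaxParams)
        return std::nullopt;       // Ran out of parameters.
      Params.push_back(Tuple);     // Create a new parameter.
      return Params.size() - 1;
    }

In the func1_of_3/func2_of_3/func3_of_3 test below, the first load and the
store both see the value tuple (@g1, @g2, @g3) and therefore share one ptr
parameter (%1 in func1_of_3.Tm), while the second load's tuple
(@g1, @g2, @g1) gets its own.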
-
-/// Copy \p origCall with a new callee \p newCallee and add a ptrauth bundle
-/// with \p Discriminator.
-void MergeFuncIgnoringConstImpl::replaceCallWithAddedPtrAuth(
-    CallInst *origCall, Value *newCallee, ConstantInt *Discriminator) {
-  SmallVector<llvm::OperandBundleDef, 4> bundles;
-  origCall->getOperandBundlesAsDefs(bundles);
-  ConstantInt *key = getPtrAuthKey();
-  llvm::Value *bundleArgs[] = {key, Discriminator};
-  bundles.emplace_back("ptrauth", bundleArgs);
-
-  SmallVector<llvm::Value *, 4> copiedArgs;
-  for (Value *op : origCall->args()) {
-    copiedArgs.push_back(op);
-  }
-
-  auto *newCall =
-      CallInst::Create(origCall->getFunctionType(), newCallee, copiedArgs,
-                       bundles, origCall->getName(), origCall);
-  newCall->setAttributes(origCall->getAttributes());
-  newCall->setTailCallKind(origCall->getTailCallKind());
-  newCall->setCallingConv(origCall->getCallingConv());
-  origCall->replaceAllUsesWith(newCall);
-  origCall->eraseFromParent();
-}
-
-void MergeFuncIgnoringConstImpl::dumpMergeInfo(const FunctionInfos &FInfos,
-                                               unsigned paramSize) {
-  std::set<llvm::IRHash> oHashes;
-  std::vector<std::string> funcLocs;
-  Function *OrigFunc = nullptr;
-  for (const auto &FInfo : FInfos) {
-    OrigFunc = FInfo.F;
-
-    llvm::IRHash origHash = StructuralHash(*OrigFunc);
-    oHashes.insert(origHash);
-
-    // Print debug location.
-    std::string Result;
-    raw_string_ostream DbgLocOS(Result);
-    if (DISubprogram *DIS = OrigFunc->getSubprogram()) {
-      DebugLoc FuncDbgLoc =
-          DILocation::get(DIS->getContext(), DIS->getScopeLine(), 0, DIS);
-      FuncDbgLoc.print(DbgLocOS);
-      DbgLocOS.flush();
-    }
-    std::string singleLine =
-        "# functionLoc " +
-        std::to_string(GlobalValue::getGUID(OrigFunc->getName())) + " " +
-        Result + " " + std::string(OrigFunc->getName()) + "\n";
-    funcLocs.push_back(singleLine);
-  }
-}
-
-/// Merge all functions in \p FInfos by creating thunks which call the single
-/// merged function with additional parameters.
-void MergeFuncIgnoringConstImpl::mergeWithParams(const FunctionInfos &FInfos,
-                                                 ParamInfos &Params) {
-  // We reuse the body of the first function for the new merged function.
-  Function *FirstF = FInfos.front().F;
-
-  // Build the type for the merged function. This will be the type of the
-  // original function (FirstF) but with the additional parameters which are
-  // needed to parameterize the merged function.
-  FunctionType *OrigTy = FirstF->getFunctionType();
-  SmallVector<Type *, 8> ParamTypes(OrigTy->param_begin(), OrigTy->param_end());
-
-  for (const ParamInfo &PI : Params) {
-    ParamTypes.push_back(PI.Values[0]->getType());
-  }
-
-  FunctionType *funcType =
-      FunctionType::get(OrigTy->getReturnType(), ParamTypes, false);
-
-  // Create the new function.
-  Function *NewFunction = Function::Create(funcType, FirstF->getLinkage(),
-                                           FirstF->getName() + MergeFuncSuffix);
-  if (auto *SP = FirstF->getSubprogram())
-    NewFunction->setSubprogram(SP);
-  NewFunction->copyAttributesFrom(FirstF);
-  // NOTE: this function is not externally available, so ensure that we reset
-  // the DLL storage class.
-  NewFunction->setDLLStorageClass(GlobalValue::DefaultStorageClass);
-  if (UseLinkOnceODRLinkageMerging)
-    NewFunction->setLinkage(GlobalValue::LinkOnceODRLinkage);
-  else
-    NewFunction->setLinkage(GlobalValue::InternalLinkage);
-  if (NoInlineForMergedFunction)
-    NewFunction->addFnAttr(Attribute::NoInline);
-
-  // Insert the new function after the last function in the equivalence class.
-  FirstF->getParent()->getFunctionList().insert(
-      std::next(FInfos[1].F->getIterator()), NewFunction);
-
-  LLVM_DEBUG(dbgs() << "  Merge into " << NewFunction->getName() << '\n');
-
-  // Move the body of FirstF into the NewFunction.
-  NewFunction->splice(NewFunction->begin(), FirstF);
-
-  auto NewArgIter = NewFunction->arg_begin();
-  for (Argument &OrigArg : FirstF->args()) {
-    Argument &NewArg = *NewArgIter++;
-    OrigArg.replaceAllUsesWith(&NewArg);
-  }
-  unsigned numOrigArgs = FirstF->arg_size();
-
-  SmallPtrSet<Function *, 8> SelfReferencingFunctions;
-
-  // Replace all differing operands with a parameter.
-  for (unsigned paramIdx = 0; paramIdx < Params.size(); ++paramIdx) {
-    const ParamInfo &PI = Params[paramIdx];
-    Argument *NewArg = NewFunction->getArg(numOrigArgs + paramIdx);
-
-    if (!PI.NeedsPointerSigning) {
-      for (const OpLocation &OL : PI.Uses) {
-        OL.I->setOperand(OL.OpIndex, NewArg);
-      }
-    }
-    // Collect all functions which are referenced by any parameter.
-    for (Value *V : PI.Values) {
-      if (auto *F = dyn_cast<Function>(V))
-        SelfReferencingFunctions.insert(F);
-    }
-  }
-
-  // Replace all differing operands which need pointer signing with a
-  // parameter.
-  // We need to do that after all other parameters, because here we replace
-  // call instructions, which must still be live in case they have another
-  // constant to be replaced.
-  for (unsigned paramIdx = 0; paramIdx < Params.size(); ++paramIdx) {
-    ParamInfo &PI = Params[paramIdx];
-    if (PI.NeedsPointerSigning) {
-      PI.computeDiscriminator(NewFunction->getContext());
-      for (const OpLocation &OL : PI.Uses) {
-        auto *origCall = cast<CallInst>(OL.I);
-        Argument *newCallee = NewFunction->getArg(numOrigArgs + paramIdx);
-        replaceCallWithAddedPtrAuth(origCall, newCallee, PI.Discriminator);
-      }
-    }
-  }
-
-  for (unsigned FIdx = 0, NumFuncs = FInfos.size(); FIdx < NumFuncs; ++FIdx) {
-    Function *OrigFunc = FInfos[FIdx].F;
-    // Don't try to replace all callers of functions which are used as
-    // parameters because we must not delete such functions.
-    if (SelfReferencingFunctions.count(OrigFunc) == 0 &&
-        replaceDirectCallers(OrigFunc, NewFunction, Params, FIdx)) {
-      // We could replace all uses (and the function is not externally visible),
-      // so we can delete the original function.
-      auto Iter = FuncEntries.find(OrigFunc);
-      assert(Iter != FuncEntries.end());
-      assert(!isInEquivalenceClass(&*Iter->second));
-      Iter->second->F = nullptr;
-      FuncEntries.erase(Iter);
-      LLVM_DEBUG(dbgs() << "    Erase " << OrigFunc->getName() << '\n');
-      OrigFunc->eraseFromParent();
-    } else {
-      // Otherwise we need a thunk which calls the merged function.
-      writeThunk(NewFunction, OrigFunc, Params, FIdx);
-    }
-    ++NumFunctionsMergedIgnoringConst;
-  }
-}
-
-/// Remove all functions of \p FE's equivalence class from FnTree. Add them to
-/// Deferred so that we'll look at them in the next round.
-void MergeFuncIgnoringConstImpl::removeEquivalenceClassFromTree(
-    FunctionEntry *FE) {
-  if (!isInEquivalenceClass(FE))
-    return;
-
-  FnTreeType::iterator Iter = FE->TreeIter;
-  FunctionEntry *Unlink = Iter->First;
-  Unlink->NumUnhandledCallees = 0;
-  while (Unlink) {
-    LLVM_DEBUG(dbgs() << "    remove from tree: " << Unlink->F->getName()
-                      << '\n');
-    if (!Unlink->IsMerged)
-      Deferred.emplace_back(Unlink->F);
-    Unlink->TreeIter = FnTree.end();
-    assert(Unlink->NumUnhandledCallees == 0);
-    FunctionEntry *NextEntry = Unlink->Next;
-    Unlink->Next = nullptr;
-    Unlink = NextEntry;
-  }
-  FnTree.erase(Iter);
-}
-
-// Helper for writeThunk: selects the proper cast operation,
-// but is a bit simpler than CastInst::getCastOpcode.
-Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
-  Type *SrcTy = V->getType();
-  if (SrcTy->isStructTy()) {
-    assert(DestTy->isStructTy());
-    assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
-    Value *Result = UndefValue::get(DestTy);
-    for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
-      Value *Element =
-          createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)),
-                     DestTy->getStructElementType(I));
-
-      Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I));
-    }
-    return Result;
-  }
-  assert(!DestTy->isStructTy());
-  if (CastArrayType) {
-    if (auto *SrcAT = dyn_cast<ArrayType>(SrcTy)) {
-      auto *DestAT = dyn_cast<ArrayType>(DestTy);
-      assert(DestAT);
-      assert(SrcAT->getNumElements() == DestAT->getNumElements());
-      Value *Result = UndefValue::get(DestTy);
-      for (unsigned int I = 0, E = SrcAT->getNumElements(); I < E; ++I) {
-        Value *Element =
-            createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)),
-                       DestAT->getElementType());
-
-        Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I));
-      }
-      return Result;
-    }
-    assert(!DestTy->isArrayTy());
-  }
-  if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
-    return Builder.CreateIntToPtr(V, DestTy);
-  else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
-    return Builder.CreatePtrToInt(V, DestTy);
-  else
-    return Builder.CreateBitCast(V, DestTy);
-}
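
In the scalar cases this picks inttoptr, ptrtoint, or a plain bitcast; for
an aggregate such as a two-element struct of i64 cast to a struct of two
pointers, it recurses field by field, emitting an extractvalue, the scalar
cast, and an insertvalue per element (an illustrative expansion of the code
above, not an extra code path).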
-
-/// Replace \p Thunk with a simple tail call to \p ToFunc. Also add parameters
-/// to the call to \p ToFunc, taken from the \p FuncIdx-th value of each
-/// entry in \p Params.
-void MergeFuncIgnoringConstImpl::writeThunk(Function *ToFunc, Function *Thunk,
-                                            const ParamInfos &Params,
-                                            unsigned FuncIdx) {
-  // Delete the existing content of Thunk.
-  Thunk->dropAllReferences();
-
-  BasicBlock *BB = BasicBlock::Create(Thunk->getContext(), "", Thunk);
-  IRBuilder<> Builder(BB);
-
-  SmallVector<Value *, 16> Args;
-  unsigned ParamIdx = 0;
-  FunctionType *ToFuncTy = ToFunc->getFunctionType();
-
-  // Add arguments which are passed through Thunk.
-  for (Argument &AI : Thunk->args()) {
-    Args.push_back(createCast(Builder, &AI, ToFuncTy->getParamType(ParamIdx)));
-    ++ParamIdx;
-  }
-  // Add new arguments defined by Params.
-  for (const ParamInfo &PI : Params) {
-    assert(ParamIdx < ToFuncTy->getNumParams());
-    Constant *param = getSignedValue(PI, FuncIdx);
-    Args.push_back(
-        createCast(Builder, param, ToFuncTy->getParamType(ParamIdx)));
-    ++ParamIdx;
-  }
-
-  CallInst *CI = Builder.CreateCall(ToFunc, Args);
-  bool isSwiftTailCall = ToFunc->getCallingConv() == CallingConv::SwiftTail &&
-                         Thunk->getCallingConv() == CallingConv::SwiftTail;
-  CI->setTailCallKind(isSwiftTailCall ? llvm::CallInst::TCK_MustTail
-                                      : llvm::CallInst::TCK_Tail);
-  CI->setCallingConv(ToFunc->getCallingConv());
-  CI->setAttributes(ToFunc->getAttributes());
-  if (Thunk->getReturnType()->isVoidTy()) {
-    Builder.CreateRetVoid();
-  } else {
-    Builder.CreateRet(createCast(Builder, CI, Thunk->getReturnType()));
-  }
-
-  LLVM_DEBUG(dbgs() << "    writeThunk: " << Thunk->getName() << '\n');
-  ++NumThunksWrittenIgnoringConst;
-}
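
The resulting thunk shape is visible in merge_func.ll below: the body of
simple_func1 is reduced to a single
    %1 = tail call i32 @simple_func1.Tm(i32 %x, i32 %y, ptr @g1)
followed by ret i32 %1, with the trailing ptr argument carrying the
function-specific constant.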
-
-static llvm::AttributeList
-fixUpTypesInByValAndStructRetAttributes(llvm::FunctionType *fnType,
-                                        llvm::AttributeList attrList) {
-  auto &context = fnType->getContext();
-  if (!context.supportsTypedPointers())
-    return attrList;
-
-  for (unsigned i = 0; i < fnType->getNumParams(); ++i) {
-    auto paramTy = fnType->getParamType(i);
-    auto attrListIndex = llvm::AttributeList::FirstArgIndex + i;
-    if (attrList.hasParamAttr(i, llvm::Attribute::StructRet) &&
-        paramTy->getNonOpaquePointerElementType() !=
-            attrList.getParamStructRetType(i))
-      attrList = attrList.replaceAttributeTypeAtIndex(
-          context, attrListIndex, llvm::Attribute::StructRet,
-          paramTy->getNonOpaquePointerElementType());
-    if (attrList.hasParamAttr(i, llvm::Attribute::ByVal) &&
-        paramTy->getNonOpaquePointerElementType() !=
-            attrList.getParamByValType(i))
-      attrList = attrList.replaceAttributeTypeAtIndex(
-          context, attrListIndex, llvm::Attribute::ByVal,
-          paramTy->getNonOpaquePointerElementType());
-  }
-  return attrList;
-}
-
-/// Replace direct callers of Old with New. Also add parameters to the call to
-/// \p New, taken from the \p FuncIdx-th value of each entry in \p Params.
-bool MergeFuncIgnoringConstImpl::replaceDirectCallers(Function *Old,
-                                                      Function *New,
-                                                      const ParamInfos &Params,
-                                                      unsigned FuncIdx) {
-  bool AllReplaced = true;
-
-  SmallVector<CallInst *, 8> Callers;
-
-  for (Use &U : Old->uses()) {
-    auto *I = dyn_cast<Instruction>(U.getUser());
-    if (!I) {
-      AllReplaced = false;
-      continue;
-    }
-    FunctionEntry *FE = getEntry(I->getFunction());
-    if (FE)
-      removeEquivalenceClassFromTree(FE);
-
-    auto *CI = dyn_cast<CallInst>(I);
-    if (!CI || CI->getCalledOperand() != Old) {
-      AllReplaced = false;
-      continue;
-    }
-    Callers.push_back(CI);
-  }
-  if (!AllReplaced)
-    return false;
-
-  // When AlwaysCallThunk is true, return false so a thunk will be emitted
-  // and call sites will not be replaced.
-  if (AlwaysCallThunk)
-    return false;
-
-  for (CallInst *CI : Callers) {
-    auto &Context = New->getContext();
-    auto NewPAL = New->getAttributes();
-
-    SmallVector<Type *, 8> OldParamTypes;
-    SmallVector<Value *, 16> NewArgs;
-    SmallVector<AttributeSet, 8> NewArgAttrs;
-    IRBuilder<> Builder(CI);
-
-    FunctionType *NewFuncTy = New->getFunctionType();
-    (void)NewFuncTy;
-    unsigned ParamIdx = 0;
-
-    // Add the existing parameters.
-    for (Value *OldArg : CI->args()) {
-      NewArgAttrs.push_back(NewPAL.getParamAttrs(ParamIdx));
-      NewArgs.push_back(OldArg);
-      OldParamTypes.push_back(OldArg->getType());
-      ++ParamIdx;
-    }
-    // Add the new parameters.
-    for (const ParamInfo &PI : Params) {
-      assert(ParamIdx < NewFuncTy->getNumParams());
-      Constant *ArgValue = getSignedValue(PI, FuncIdx);
-      assert(ArgValue != Old && "should not try to replace all callers of self "
-                                "referencing functions");
-      NewArgs.push_back(ArgValue);
-      OldParamTypes.push_back(ArgValue->getType());
-      ++ParamIdx;
-    }
-
-    auto *FType = FunctionType::get(Old->getFunctionType()->getReturnType(),
-                                    OldParamTypes, false);
-    auto *FPtrType = PointerType::get(
-        FType, cast<PointerType>(New->getType())->getAddressSpace());
-
-    Value *Callee = ConstantExpr::getBitCast(New, FPtrType);
-    CallInst *NewCI;
-    if (objcarc::hasAttachedCallOpBundle(CI)) {
-      Value *BundleArgs[] = {*objcarc::getAttachedARCFunction(CI)};
-      OperandBundleDef OB("clang.arc.attachedcall", BundleArgs);
-      NewCI = Builder.CreateCall(FType, Callee, NewArgs, {OB});
-    } else {
-      NewCI = Builder.CreateCall(FType, Callee, NewArgs);
-    }
-    NewCI->setCallingConv(CI->getCallingConv());
-    // Don't transfer function attributes from the callee to the call site.
-    // Function attributes typically aren't relevant to the calling convention
-    // or ABI.
-    auto newAttrList = AttributeList::get(Context, /*FnAttrs=*/AttributeSet(),
-                                          NewPAL.getRetAttrs(), NewArgAttrs);
-    newAttrList = fixUpTypesInByValAndStructRetAttributes(FType, newAttrList);
-    NewCI->setAttributes(newAttrList);
-    if (IgnoreMusttailFunction && CI->isMustTailCall()) {
-      // We are replacing a call site that is marked musttail.
-      llvm::errs() << "callsite has musttail in newF " << New->getName()
-                   << "\n";
-    }
-    NewCI->copyMetadata(*CI);
-    CI->replaceAllUsesWith(NewCI);
-    CI->eraseFromParent();
-  }
-  assert(Old->use_empty() && "should have replaced all uses of old function");
-  return Old->hasLocalLinkage();
-}
-
-PreservedAnalyses MergeFuncIgnoringConstPass::run(Module &M,
-                                                  ModuleAnalysisManager &MAM) {
-  if (MergeFuncIgnoringConstImpl(PtrAuthEnabled, PtrAuthKey, MergeFuncSuffix)
-          .runImpl(M))
-    return PreservedAnalyses::none();
-  return PreservedAnalyses::all();
-}
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index 9c320beb09711af..51e8821773c3af3 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -27,7 +27,6 @@ add_llvm_component_library(LLVMTransformUtils
   FixIrreducible.cpp
   FlattenCFG.cpp
   FunctionComparator.cpp
-  FunctionComparatorIgnoringConst.cpp
   FunctionImportUtils.cpp
   GlobalStatus.cpp
   GuardUtils.cpp
diff --git a/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp b/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp
deleted file mode 100644
index 9cfd95345598083..000000000000000
--- a/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-//===--- FunctionComparatorIgnoringConst.cpp - Function Comparator --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h"
-
-using namespace llvm;
-
-int FunctionComparatorIgnoringConst::cmpOperandsIgnoringConsts(
-    const Instruction *L, const Instruction *R, unsigned opIdx) {
-  Value *OpL = L->getOperand(opIdx);
-  Value *OpR = R->getOperand(opIdx);
-
-  int Res = cmpValues(OpL, OpR);
-  if (Res == 0)
-    return Res;
-
-  if (!isa<Constant>(OpL) || !isa<Constant>(OpR))
-    return Res;
-
-  if (!isEligibleOperandForConstantSharing(L, opIdx) ||
-      !isEligibleOperandForConstantSharing(R, opIdx))
-    return Res;
-
-  if (cmpTypes(OpL->getType(), OpR->getType()))
-    return Res;
-
-  return 0;
-}
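
A worked instance of the checks above: when comparing "load i32, ptr @g1"
against "load i32, ptr @g2" at the same position, cmpValues reports a
difference, both operands are constants at a position eligible for constant
sharing, and the operand types match, so the function returns 0 and the two
instructions compare equal up to constants.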
-
-// Test whether two basic blocks have equivalent behavior.
-int FunctionComparatorIgnoringConst::cmpBasicBlocksIgnoringConsts(
-    const BasicBlock *BBL, const BasicBlock *BBR,
-    const std::set<std::pair<int, int>> *InstOpndIndex) {
-  BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
-  BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
-
-  do {
-    bool needToCmpOperands = true;
-    if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands))
-      return Res;
-    if (needToCmpOperands) {
-      assert(InstL->getNumOperands() == InstR->getNumOperands());
-
-      for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
-        // When a set for (instruction, operand) index pairs is given, we only
-        // ignore constants located at such indices. Otherwise, we precisely
-        // compare the operands.
-        if (InstOpndIndex && !InstOpndIndex->count(std::make_pair(Index, i))) {
-          Value *OpL = InstL->getOperand(i);
-          Value *OpR = InstR->getOperand(i);
-          if (int Res = cmpValues(OpL, OpR))
-            return Res;
-        }
-        if (int Res = cmpOperandsIgnoringConsts(&*InstL, &*InstR, i))
-          return Res;
-        // cmpValues should ensure this is true.
-        assert(cmpTypes(InstL->getOperand(i)->getType(),
-                        InstR->getOperand(i)->getType()) == 0);
-      }
-    }
-    ++Index;
-    ++InstL, ++InstR;
-  } while (InstL != InstLE && InstR != InstRE);
-
-  if (InstL != InstLE && InstR == InstRE)
-    return 1;
-  if (InstL == InstLE && InstR != InstRE)
-    return -1;
-  return 0;
-}
-
-// Test whether the two functions have equivalent behavior.
-int FunctionComparatorIgnoringConst::compareIgnoringConsts(
-    const std::set<std::pair<int, int>> *InstOpndIndex) {
-  beginCompare();
-  Index = 0;
-
-  if (int Res = compareSignature())
-    return Res;
-
-  Function::const_iterator LIter = FnL->begin(), LEnd = FnL->end();
-  Function::const_iterator RIter = FnR->begin(), REnd = FnR->end();
-
-  do {
-    const BasicBlock *BBL = &*LIter;
-    const BasicBlock *BBR = &*RIter;
-
-    if (int Res = cmpValues(BBL, BBR))
-      return Res;
-
-    if (int Res = cmpBasicBlocksIgnoringConsts(BBL, BBR, InstOpndIndex))
-      return Res;
-
-    ++LIter, ++RIter;
-  } while (LIter != LEnd && RIter != REnd);
-
-  return 0;
-}
diff --git a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll
deleted file mode 100644
index 1d84340da417235..000000000000000
--- a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll
+++ /dev/null
@@ -1,532 +0,0 @@
-; RUN: opt -S -mergefunc-ignoringconst-threshold=4 -passes=mergefunc-ignoring-const %s | FileCheck %s
-
- at g1 = external global i32
- at g2 = external global i32
- at g3 = external global i32
- at g4 = external global i32
- at g5 = external global i32
-
-; Test the most trivial example.
-
-; CHECK-LABEL: define i32 @simple_func1(i32 %x, i32 %y)
-; CHECK: %1 = tail call i32 @simple_func1.Tm(i32 %x, i32 %y, ptr @g1)
-; CHECK: ret i32 %1
-define i32 @simple_func1(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = add i32 %sum, %y
-  %l = load i32, i32* @g1, align 4
-  %sum3 = add i32 %sum2, %y
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define i32 @simple_func2(i32 %x, i32 %y)
-; CHECK: %1 = tail call i32 @simple_func1.Tm(i32 %x, i32 %y, ptr @g2)
-; CHECK: ret i32 %1
-define i32 @simple_func2(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = add i32 %sum, %y
-  %l = load i32, i32* @g2, align 4
-  %sum3 = add i32 %sum2, %y
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define internal i32 @simple_func1.Tm(i32 %0, i32 %1, ptr %2)
-; CHECK: %l = load i32, ptr %2
-; CHECK: ret
-
-
-; Merge 3 functions with 3 types of differing instructions: load, store and call.
-
-; CHECK-LABEL: define i32 @func1_of_3(i32 %x)
-; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g1, ptr @g1, ptr @callee1)
-; CHECK: ret i32 %1
-define i32 @func1_of_3(i32 %x) {
-  %l1 = load i32, i32* @g1, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g1, align 4
-  %sum2 = add i32 %sum, %l2
-  store i32 %sum2, i32 *@g1, align 4
-  call void @callee1(i32 %sum2)
-  %sum3 = add i32 %sum2, %l2
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define i32 @func2_of_3(i32 %x)
-; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g2, ptr @g2, ptr @callee2)
-; CHECK: ret i32 %1
-define i32 @func2_of_3(i32 %x) {
-  %l1 = load i32, i32* @g2, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g2, align 4
-  %sum2 = add i32 %sum, %l2
-  store i32 %sum2, i32 *@g2, align 4
-  call void @callee2(i32 %sum2)
-  %sum3 = add i32 %sum2, %l2
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define i32 @func3_of_3(i32 %x)
-; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g3, ptr @g1, ptr @callee3)
-; CHECK: ret i32 %1
-define i32 @func3_of_3(i32 %x) {
-  %l1 = load i32, i32* @g3, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g1, align 4
-  %sum2 = add i32 %sum, %l2
-  store i32 %sum2, i32 *@g3, align 4
-  call void @callee3(i32 %sum2)
-  %sum3 = add i32 %sum2, %l2
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define internal i32 @func1_of_3.Tm(i32 %0, ptr %1, ptr %2, ptr %3)
-; CHECK: %l1 = load i32, ptr %1
-; CHECK: %l2 = load i32, ptr %2
-; CHECK: store i32 %sum2, ptr %1
-; CHECK: call void %3(i32 %sum2)
-; CHECK: ret
-
-declare void @callee1(i32 %x)
-declare void @callee2(i32 %x)
-declare void @callee3(i32 %x)
-
-; Preserve attributes
-
-; CHECK-LABEL: define void @sret_func1(ptr sret(i32) %p, i32 %x, i32 %y)
-; CHECK: tail call void @sret_func1.Tm(ptr sret(i32) %p, i32 %x, i32 %y, ptr @g1)
-; CHECK: ret void
-define void @sret_func1(i32* sret(i32) %p, i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %l = load i32, i32* @g1, align 4
-  %sum2 = add i32 %sum, %l
-  store i32 %sum2, i32* %p
-  ret void
-}
-
-; CHECK-LABEL: define void @sret_func2(ptr sret(i32) %p, i32 %x, i32 %y)
-; CHECK: tail call void @sret_func1.Tm(ptr sret(i32) %p, i32 %x, i32 %y, ptr @g2)
-; CHECK: ret void
-define void @sret_func2(i32* sret(i32) %p, i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %l = load i32, i32* @g2, align 4
-  %sum2 = add i32 %sum, %l
-  store i32 %sum2, i32* %p
-  ret void
-}
-
-; CHECK-LABEL: define internal void @sret_func1.Tm(ptr sret(i32) %0, i32 %1, i32 %2, ptr %3)
-; CHECK: %l = load i32, ptr %3, align 4
-; CHECK: store i32 %sum2, ptr %0
-; CHECK: ret
-
-
-; Don't merge all functions, because we would generate too many parameters.
-; Instead merge those functions which match best.
-
-; CHECK-LABEL: define i32 @func1_merged_with3(i32 %x)
-; CHECK: %1 = tail call i32 @func1_merged_with3.Tm(i32 %x, ptr @g1)
-; CHECK: ret i32 %1
-define i32 @func1_merged_with3(i32 %x) {
-  %l1 = load i32, i32* @g1, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g2, align 4
-  %sum2 = add i32 %sum, %l2
-  %l3 = load i32, i32* @g3, align 4
-  %sum3 = add i32 %sum2, %l2
-  %l4 = load i32, i32* @g4, align 4
-  %sum4 = add i32 %sum3, %l2
-  %l5 = load i32, i32* @g5, align 4
-  %sum5 = add i32 %sum4, %l2
-  ret i32 %sum5
-}
-
-; CHECK-LABEL: define i32 @func2_merged_with4(i32 %x)
-; CHECK: %1 = tail call i32 @func2_merged_with4.Tm(i32 %x, ptr @g2)
-; CHECK: ret i32 %1
-define i32 @func2_merged_with4(i32 %x) {
-  %l1 = load i32, i32* @g2, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g3, align 4
-  %sum2 = add i32 %sum, %l2
-  %l3 = load i32, i32* @g4, align 4
-  %sum3 = add i32 %sum2, %l2
-  %l4 = load i32, i32* @g5, align 4
-  %sum4 = add i32 %sum3, %l2
-  %l5 = load i32, i32* @g1, align 4
-  %sum5 = add i32 %sum4, %l2
-  ret i32 %sum5
-}
-
-; CHECK-LABEL: define i32 @func3_merged_with1(i32 %x)
-; CHECK: %1 = tail call i32 @func1_merged_with3.Tm(i32 %x, ptr @g2)
-; CHECK: ret i32 %1
-define i32 @func3_merged_with1(i32 %x) {
-  %l1 = load i32, i32* @g2, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g2, align 4
-  %sum2 = add i32 %sum, %l2
-  %l3 = load i32, i32* @g3, align 4
-  %sum3 = add i32 %sum2, %l2
-  %l4 = load i32, i32* @g4, align 4
-  %sum4 = add i32 %sum3, %l2
-  %l5 = load i32, i32* @g5, align 4
-  %sum5 = add i32 %sum4, %l2
-  ret i32 %sum5
-}
-
-; CHECK-LABEL: define internal i32 @func1_merged_with3.Tm(i32 %0, ptr %1)
-; CHECK: load i32, ptr %1, align 4
-; CHECK: load i32, ptr @g2, align 4
-; CHECK: load i32, ptr @g3, align 4
-; CHECK: load i32, ptr @g4, align 4
-; CHECK: load i32, ptr @g5, align 4
-; CHECK: ret i32
-
-; CHECK-LABEL: define i32 @func4_merged_with2(i32 %x) {
-; CHECK: %1 = tail call i32 @func2_merged_with4.Tm(i32 %x, ptr @g1)
-; CHECK: ret i32 %1
-define i32 @func4_merged_with2(i32 %x) {
-  %l1 = load i32, i32* @g1, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g3, align 4
-  %sum2 = add i32 %sum, %l2
-  %l3 = load i32, i32* @g4, align 4
-  %sum3 = add i32 %sum2, %l2
-  %l4 = load i32, i32* @g5, align 4
-  %sum4 = add i32 %sum3, %l2
-  %l5 = load i32, i32* @g1, align 4
-  %sum5 = add i32 %sum4, %l2
-  ret i32 %sum5
-}
-
-
-; The same example as above, but we cannot merge Function2 with Function4,
-; because Function4 calls Function1 (which is merged with Function3 in the
-; first iteration).
-
-declare i32 @get_int(i32 %x)
-
-; CHECK-LABEL: define i32 @Function1_merged_with_3(i32 %x)
-; CHECK: %1 = tail call i32 @Function1_merged_with_3.Tm(i32 %x, ptr @g1)
-; CHECK: ret i32 %1
-define i32 @Function1_merged_with_3(i32 %x) {
-  %l1 = load i32, i32* @g1, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g2, align 4
-  %sum2 = add i32 %sum, %l2
-  %l3 = load i32, i32* @g3, align 4
-  %sum3 = add i32 %sum2, %l2
-  %l4 = load i32, i32* @g4, align 4
-  %sum4 = add i32 %sum3, %l2
-  %l5 = load i32, i32* @g5, align 4
-  %sum5 = add i32 %sum4, %l2
-  %c = call fastcc i32 @get_int(i32 %sum5)
-  ret i32 %c
-}
-
-; CHECK-LABEL: define i32 @Function2_not_merged(i32 %x)
-; CHECK: load
-; CHECK: load
-; CHECK: load
-; CHECK: load
-; CHECK: %c = call fastcc i32 @get_int
-; CHECK: ret i32 %c
-define i32 @Function2_not_merged(i32 %x) {
-  %l1 = load i32, i32* @g2, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g3, align 4
-  %sum2 = add i32 %sum, %l2
-  %l3 = load i32, i32* @g4, align 4
-  %sum3 = add i32 %sum2, %l2
-  %l4 = load i32, i32* @g5, align 4
-  %sum4 = add i32 %sum3, %l2
-  %l5 = load i32, i32* @g1, align 4
-  %sum5 = add i32 %sum4, %l2
-  %c = call fastcc i32 @get_int(i32 %sum5)
-  ret i32 %c
-}
-
-; CHECK-LABEL: define i32 @Function3_merged_with_1(i32 %x)
-; CHECK: %1 = tail call i32 @Function1_merged_with_3.Tm(i32 %x, ptr @g2)
-; CHECK: ret i32 %1
-define i32 @Function3_merged_with_1(i32 %x) {
-  %l1 = load i32, i32* @g2, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g2, align 4
-  %sum2 = add i32 %sum, %l2
-  %l3 = load i32, i32* @g3, align 4
-  %sum3 = add i32 %sum2, %l2
-  %l4 = load i32, i32* @g4, align 4
-  %sum4 = add i32 %sum3, %l2
-  %l5 = load i32, i32* @g5, align 4
-  %sum5 = add i32 %sum4, %l2
-  %c = call fastcc i32 @get_int(i32 %sum5)
-  ret i32 %c
-}
-
-; CHECK-LABEL: define internal i32 @Function1_merged_with_3.Tm(i32 %0, ptr %1)
-; CHECK: load
-; CHECK: load
-; CHECK: load
-; CHECK: load
-; CHECK: %c = call fastcc i32 @get_int
-; CHECK: ret i32 %c
-
-; CHECK-LABEL: define i32 @Function4_not_merged(i32 %x) {
-; CHECK: load
-; CHECK: load
-; CHECK: load
-; CHECK: load
-; CHECK: %1 = call fastcc i32 @Function1_merged_with_3.Tm(i32 %sum5, ptr @g1)
-; CHECK: ret i32 %1
-define i32 @Function4_not_merged(i32 %x) {
-  %l1 = load i32, i32* @g1, align 4
-  %sum = add i32 %x, %l1
-  %l2 = load i32, i32* @g3, align 4
-  %sum2 = add i32 %sum, %l2
-  %l3 = load i32, i32* @g4, align 4
-  %sum3 = add i32 %sum2, %l2
-  %l4 = load i32, i32* @g5, align 4
-  %sum4 = add i32 %sum3, %l2
-  %l5 = load i32, i32* @g1, align 4
-  %sum5 = add i32 %sum4, %l2
-  %c = call fastcc i32 @Function1_merged_with_3(i32 %sum5)
-  ret i32 %c
-}
-
-
-; Test a call chain: caller -> callee1 -> callee2.
-; Functions should be merged in bottom-up order: callee2, callee1, caller.
-; Also check that the calling convention is preserved.
-
-; CHECK-LABEL: define fastcc i32 @callee1_a(i32 %x, i32 %y)
-; CHECK: %1 = tail call fastcc i32 @callee1_a.Tm(i32 %x, i32 %y, ptr @g1)
-; CHECK: ret i32 %1
-define fastcc i32 @callee1_a(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = add i32 %sum, %y
-  %c = call i32 @callee2_a(i32 %sum2, i32 %y)
-  %sum3 = add i32 %sum2, %c
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define fastcc i32 @callee1_b(i32 %x, i32 %y)
-; CHECK: %1 = tail call fastcc i32 @callee1_a.Tm(i32 %x, i32 %y, ptr @g2)
-; CHECK: ret i32 %1
-define fastcc i32 @callee1_b(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = add i32 %sum, %y
-  %c = call i32 @callee2_b(i32 %sum2, i32 %y)
-  %sum3 = add i32 %sum2, %c
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define internal fastcc i32 @callee1_a.Tm(i32 %0, i32 %1, ptr %2)
-; CHECK: call i32 @callee2_a.Tm(i32 %sum2, i32 %1, ptr %2)
-; CHECK: ret
-
-; CHECK-NOT: @callee2_a(
-define internal i32 @callee2_a(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = sub i32 %sum, %y
-  %l = load i32, i32* @g1, align 4
-  %sum3 = add i32 %sum2, %y
-  ret i32 %sum3
-}
-
-; CHECK-NOT: @callee2_b(
-define internal i32 @callee2_b(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = sub i32 %sum, %y
-  %l = load i32, i32* @g2, align 4
-  %sum3 = add i32 %sum2, %y
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define i32 @caller_a(i32 %x, i32 %y)
-; CHECK: %1 = tail call i32 @caller_a.Tm(i32 %x, i32 %y, ptr @g1)
-; CHECK: ret i32 %1
-define i32 @caller_a(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = add i32 %sum, %y
-  %c = call fastcc i32 @callee1_a(i32 %sum2, i32 %y)
-  %sum3 = add i32 %sum2, %c
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define i32 @caller_b(i32 %x, i32 %y)
-; CHECK: %1 = tail call i32 @caller_a.Tm(i32 %x, i32 %y, ptr @g2)
-; CHECK: ret i32 %1
-define i32 @caller_b(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = add i32 %sum, %y
-  %c = call fastcc i32 @callee1_b(i32 %sum2, i32 %y)
-  %sum3 = add i32 %sum2, %c
-  ret i32 %sum3
-}
-
-; CHECK-LABEL: define internal i32 @caller_a.Tm(i32 %0, i32 %1, ptr %2)
-; CHECK: call fastcc i32 @callee1_a.Tm(i32 %sum2, i32 %1, ptr %2)
-; CHECK: ret
-
-
-; Ensure that we do not merge functions that are identical with the
-; exception of the order of the incoming blocks to a phi.
-
-; CHECK-LABEL: define linkonce_odr hidden i1 @first(i2 %0)
-define linkonce_odr hidden i1 @first(i2) {
-entry:
-; CHECK: switch i2
-  switch i2 %0, label %default [
-    i2 0, label %L1
-    i2 1, label %L2
-    i2 -2, label %L3
-  ]
-default:
-  unreachable
-L1:
-  br label %done
-L2:
-  br label %done
-L3:
-  br label %done
-done:
-  %result = phi i1 [ true, %L1 ], [ false, %L2 ], [ false, %L3 ]
-; CHECK: ret i1
-  ret i1 %result
-}
-
-; CHECK-LABEL: define linkonce_odr hidden i1 @second(i2 %0)
-define linkonce_odr hidden i1 @second(i2) {
-entry:
-; CHECK: switch i2
-  switch i2 %0, label %default [
-    i2 0, label %L1
-    i2 1, label %L2
-    i2 -2, label %L3
-  ]
-default:
-  unreachable
-L1:
-  br label %done
-L2:
-  br label %done
-L3:
-  br label %done
-done:
-  %result = phi i1 [ true, %L3 ], [ false, %L2 ], [ false, %L1 ]
-; CHECK: ret i1
-  ret i1 %result
-}
-
-; Check self recursive functions
-
-; CHECK-LABEL: define internal void @recursive1(i32 %x, i32 %y)
-; CHECK: tail call void @recursive1.Tm(i32 %x, i32 %y, ptr @g1, ptr @recursive1)
-; CHECK: ret void
-define internal void @recursive1(i32 %x, i32 %y) {
-  br i1 undef, label %bb1, label %bb2
-
-bb1:
-  %l = load i32, i32* @g1, align 4
-  call void @recursive1(i32 %x, i32 %y)
-  br label %bb2
-
-bb2:
-  ret void
-}
-
-; CHECK-LABEL: define internal void @recursive2(i32 %x, i32 %y)
-; CHECK: tail call void @recursive1.Tm(i32 %x, i32 %y, ptr @g2, ptr @recursive2)
-; CHECK: ret void
-define internal void @recursive2(i32 %x, i32 %y) {
-  br i1 undef, label %bb1, label %bb2
-
-bb1:
-  %l = load i32, i32* @g2, align 4
-  call void @recursive2(i32 %x, i32 %y)
-  br label %bb2
-
-bb2:
-  ret void
-}
-; CHECK-LABEL: define internal void @recursive1.Tm(i32 %0, i32 %1, ptr %2, ptr %3)
-; CHECK: load i32, ptr %2
-; CHECK: call void %3(i32 %0, i32 %1)
-; CHECK: ret void
-
-
-; CHECK-LABEL: define internal void @another_recursive_func(i32 %x)
-; CHECK: tail call void @another_recursive_func.Tm(i32 %x, ptr @g1, ptr @another_recursive_func)
-; CHECK: ret void
-define internal void @another_recursive_func(i32 %x) {
-  br i1 undef, label %bb1, label %bb2
-
-bb1:
-  store i32 %x, i32 *@g1, align 4
-  call void @another_recursive_func(i32 %x)
-  br label %bb2
-
-bb2:
-  ret void
-}
-; CHECK-NOT: @not_really_recursive(
-
-; CHECK-LABEL: define internal void @another_recursive_func.Tm(i32 %0, ptr %1, ptr %2)
-; CHECK: store i32 %0, ptr %1
-; CHECK: call void %2(i32 %0)
-; CHECK: ret void
-define internal void @not_really_recursive(i32 %x) {
-  br i1 undef, label %bb1, label %bb2
-
-bb1:
-  store i32 %x, i32 *@g2, align 4
-  call void @callee1(i32 %x)
-  br label %bb2
-
-bb2:
-  ret void
-}
-; CHECK-NOT: @not_really_recursive(
-
-; CHECK-LABEL: define void @call_recursive_funcs(i32 %x)
-; CHECK: call void @recursive1(i32 %x, i32 %x)
-; CHECK: call void @recursive2(i32 %x, i32 %x)
-; CHECK: call void @another_recursive_func(i32 %x)
-; CHECK: call void @another_recursive_func.Tm(i32 %x, ptr @g2, ptr @callee1)
-; CHECK: ret void
-define void @call_recursive_funcs(i32 %x) {
-  call void @recursive1(i32 %x, i32 %x)
-  call void @recursive2(i32 %x, i32 %x)
-  call void @another_recursive_func(i32 %x)
-  call void @not_really_recursive(i32 %x)
-  ret void
-}
-
-; Ensure that we do not merge functions which make use of distinct dtrace
-; probes. Each call to a dtrace probe must resolve to a unique patchpoint.
-
-declare void @"__dtrace_probe$Apple$Probe1$v1$696e74"(i32) local_unnamed_addr
-
-; CHECK-LABEL: define i32 @use_dtrace_probe1
-; CHECK: call void @"__dtrace_probe$Apple$Probe1$v1$696e74"
-define i32 @use_dtrace_probe1(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = add i32 %sum, %y
-  %l = load i32, i32* @g1, align 4
-  %sum3 = add i32 %sum2, %y
-  tail call void @"__dtrace_probe$Apple$Probe1$v1$696e74"(i32 undef)
-  ret i32 %sum3
-}
-
-declare void @"__dtrace_probe$Apple$Probe2$v1$696e74"(i32) local_unnamed_addr
-
-; CHECK-LABEL: define i32 @use_dtrace_probe2
-; CHECK: call void @"__dtrace_probe$Apple$Probe2$v1$696e74"
-define i32 @use_dtrace_probe2(i32 %x, i32 %y) {
-  %sum = add i32 %x, %y
-  %sum2 = add i32 %sum, %y
-  %l = load i32, i32* @g2, align 4
-  %sum3 = add i32 %sum2, %y
-  tail call void @"__dtrace_probe$Apple$Probe2$v1$696e74"(i32 undef)
-  ret i32 %sum3
-}
diff --git a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll
deleted file mode 100644
index c5c8b898c046e51..000000000000000
--- a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll
+++ /dev/null
@@ -1,190 +0,0 @@
-; RUN: opt -S -enable-aggressive-mergefunc-ignoringconst -passes=mergefunc-ignoring-const %s -o - | FileCheck %s
-
-%4 = type opaque
-%10 = type opaque
-%"struct.SearchSpec::State" = type { %4* }
-%"struct.PointerList" = type { i8*, i8*, i8*, i8*, i8* }
-%"struct.DynamicCallback" = type { %10* }
-
-; CHECK: define ptr @invoke_foo(ptr nocapture readonly %.block_descriptor, ptr %stateWrapper)
-; CHECK: %1 = {{.*}}call ptr @invoke_foo.Tm
-; CHECK: define ptr @invoke_bar(ptr nocapture readonly %.block_descriptor, ptr %stateWrapper) {
-; CHECK: %1 = {{.*}}call ptr @invoke_foo.Tm
-; CHECK: define {{.*}}.Tm(ptr nocapture readonly %0, ptr %1, ptr %2, ptr %3)
-
-; Function Attrs: minsize optsize ssp uwtable
-define i8* @invoke_foo(i8* nocapture readonly %.block_descriptor, i8* %stateWrapper) #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
-entry:
-  %state = alloca %"struct.SearchSpec::State", align 8
-  %agg.tmp = alloca %"struct.PointerList", align 8
-  %0 = tail call i8* @llvm.objc.retain(i8* %stateWrapper) #2
-  %1 = bitcast %"struct.SearchSpec::State"* %state to i8*
-  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #2
-  %2 = getelementptr inbounds i8, i8* %stateWrapper, i64 16
-  %3 = bitcast i8* %2 to %"struct.SearchSpec::State"* (i8*)**
-  %4 = load %"struct.SearchSpec::State"* (i8*)*, %"struct.SearchSpec::State"* (i8*)** %3, align 8
-  %call.i4 = invoke nonnull align 8 dereferenceable(8) %"struct.SearchSpec::State"* %4(i8* nonnull %stateWrapper) #31
-          to label %invoke.cont unwind label %lpad
-
-invoke.cont:                                      ; preds = %entry
-  %initialText.i.i = getelementptr inbounds %"struct.SearchSpec::State", %"struct.SearchSpec::State"* %state, i64 0, i32 0
-  %initialText2.i.i = getelementptr inbounds %"struct.SearchSpec::State", %"struct.SearchSpec::State"* %call.i4, i64 0, i32 0
-  %5 = load %4*, %4** %initialText2.i.i, align 8
-  %6 = bitcast %4* %5 to i8*
-  %7 = tail call i8* @llvm.objc.retain(i8* %6) #2
-  store %4* %5, %4** %initialText.i.i, align 8
-  %block.capture.addr = getelementptr inbounds i8, i8* %.block_descriptor, i64 32
-  %8 = bitcast i8* %block.capture.addr to i8**
-  %9 = load i8*, i8** %8, align 8
-  invoke void @callee2(%"struct.PointerList"* nonnull sret(%"struct.PointerList") align 8 %agg.tmp, i8* %9, i1 zeroext false) #31
-          to label %invoke.cont2 unwind label %lpad1
-
-invoke.cont2:                                     ; preds = %invoke.cont
-  %block.capture.addr3 = getelementptr inbounds i8, i8* %.block_descriptor, i64 40
-  %10 = bitcast i8* %block.capture.addr3 to %4**
-  %agg.tmp6.sroa.3.0..sroa_idx12 = getelementptr inbounds %"struct.PointerList", %"struct.PointerList"* %agg.tmp, i64 0, i32 3
-  %agg.tmp6.sroa.3.0.copyload = load i8*, i8** %agg.tmp6.sroa.3.0..sroa_idx12, align 8
-  %11 = load %4*, %4** %10, align 8
-  invoke void @callee1(%"struct.SearchSpec::State"* nonnull align 8 dereferenceable(8) %state, %4* %11) #31
-          to label %invoke.cont4 unwind label %lpad.i
-
-lpad.i:                                           ; preds = %invoke.cont2
-  %12 = landingpad { i8*, i32 }
-          cleanup
-  call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2
-  %.phi.trans.insert = bitcast %"struct.SearchSpec::State"* %state to i8**
-  %.pre = load i8*, i8** %.phi.trans.insert, align 8
-  br label %lpad1.body
-
-invoke.cont4:                                     ; preds = %invoke.cont2
-  call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2
-  %13 = load %4*, %4** %initialText.i.i, align 8
-  store %4* null, %4** %initialText.i.i, align 8
-  %call78 = call fastcc i8* @callee3(%4* %13) #31 [ "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  call void (...) @llvm.objc.clang.arc.noop.use(i8* %call78) #2
-  %14 = bitcast %"struct.SearchSpec::State"* %state to i8**
-  %15 = load i8*, i8** %14, align 8
-  call void @llvm.objc.release(i8* %15) #2
-  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2
-  call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1
-  %16 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %call78) #2
-  ret i8* %call78
-
-lpad:                                             ; preds = %entry
-  %17 = landingpad { i8*, i32 }
-          cleanup
-  br label %ehcleanup
-
-lpad1:                                            ; preds = %invoke.cont
-  %18 = landingpad { i8*, i32 }
-          cleanup
-  br label %lpad1.body
-
-lpad1.body:                                       ; preds = %lpad1, %lpad.i
-  %19 = phi i8* [ %6, %lpad1 ], [ %.pre, %lpad.i ]
-  %eh.lpad-body = phi { i8*, i32 } [ %18, %lpad1 ], [ %12, %lpad.i ]
-  call void @llvm.objc.release(i8* %19) #2
-  br label %ehcleanup
-
-ehcleanup:                                        ; preds = %lpad1.body, %lpad
-  %.pn = phi { i8*, i32 } [ %eh.lpad-body, %lpad1.body ], [ %17, %lpad ]
-  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2
-  call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1
-  resume { i8*, i32 } %.pn
-}
-
-; Function Attrs: minsize optsize ssp uwtable
-define i8* @invoke_bar(i8* nocapture readonly %.block_descriptor, i8* %stateWrapper) #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
-entry:
-  %state = alloca %"struct.DynamicCallback", align 8
-  %agg.tmp = alloca %"struct.PointerList", align 8
-  %0 = tail call i8* @llvm.objc.retain(i8* %stateWrapper) #2
-  %1 = bitcast %"struct.DynamicCallback"* %state to i8*
-  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #2
-  %2 = getelementptr inbounds i8, i8* %stateWrapper, i64 16
-  %3 = bitcast i8* %2 to %"struct.DynamicCallback"* (i8*)**
-  %4 = load %"struct.DynamicCallback"* (i8*)*, %"struct.DynamicCallback"* (i8*)** %3, align 8
-  %call.i4 = invoke nonnull align 8 dereferenceable(8) %"struct.DynamicCallback"* %4(i8* nonnull %stateWrapper) #31
-          to label %invoke.cont unwind label %lpad
-
-invoke.cont:                                      ; preds = %entry
-  %call.i.i = getelementptr inbounds %"struct.DynamicCallback", %"struct.DynamicCallback"* %state, i64 0, i32 0
-  %call2.i.i = getelementptr inbounds %"struct.DynamicCallback", %"struct.DynamicCallback"* %call.i4, i64 0, i32 0
-  %5 = load %10*, %10** %call2.i.i, align 8
-  %6 = bitcast %10* %5 to i8*
-  %7 = tail call i8* @llvm.objc.retain(i8* %6) #2
-  store %10* %5, %10** %call.i.i, align 8
-  %block.capture.addr = getelementptr inbounds i8, i8* %.block_descriptor, i64 32
-  %8 = bitcast i8* %block.capture.addr to i8**
-  %9 = load i8*, i8** %8, align 8
-  invoke void @callee2(%"struct.PointerList"* nonnull sret(%"struct.PointerList") align 8 %agg.tmp, i8* %9, i1 zeroext false) #31
-          to label %invoke.cont2 unwind label %lpad1
-
-invoke.cont2:                                     ; preds = %invoke.cont
-  %block.capture.addr3 = getelementptr inbounds i8, i8* %.block_descriptor, i64 40
-  %10 = bitcast i8* %block.capture.addr3 to %10**
-  %agg.tmp6.sroa.3.0..sroa_idx12 = getelementptr inbounds %"struct.PointerList", %"struct.PointerList"* %agg.tmp, i64 0, i32 3
-  %agg.tmp6.sroa.3.0.copyload = load i8*, i8** %agg.tmp6.sroa.3.0..sroa_idx12, align 8
-  %11 = load %10*, %10** %10, align 8
-  invoke void @callee5(%"struct.DynamicCallback"* nonnull align 8 dereferenceable(8) %state, %10* %11) #31
-          to label %invoke.cont4 unwind label %lpad.i
-
-lpad.i:                                           ; preds = %invoke.cont2
-  %12 = landingpad { i8*, i32 }
-          cleanup
-  call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2
-  %.phi.trans.insert = bitcast %"struct.DynamicCallback"* %state to i8**
-  %.pre = load i8*, i8** %.phi.trans.insert, align 8
-  br label %lpad1.body
-
-invoke.cont4:                                     ; preds = %invoke.cont2
-  call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2
-  %13 = load %10*, %10** %call.i.i, align 8
-  store %10* null, %10** %call.i.i, align 8
-  %call78 = call fastcc i8* @callee4(%10* %13) #31 [ "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  call void (...) @llvm.objc.clang.arc.noop.use(i8* %call78) #2
-  %14 = bitcast %"struct.DynamicCallback"* %state to i8**
-  %15 = load i8*, i8** %14, align 8
-  call void @llvm.objc.release(i8* %15) #2
-  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2
-  call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1
-  %16 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %call78) #2
-  ret i8* %call78
-
-lpad:                                             ; preds = %entry
-  %17 = landingpad { i8*, i32 }
-          cleanup
-  br label %ehcleanup
-
-lpad1:                                            ; preds = %invoke.cont
-  %18 = landingpad { i8*, i32 }
-          cleanup
-  br label %lpad1.body
-
-lpad1.body:                                       ; preds = %lpad1, %lpad.i
-  %19 = phi i8* [ %6, %lpad1 ], [ %.pre, %lpad.i ]
-  %eh.lpad-body = phi { i8*, i32 } [ %18, %lpad1 ], [ %12, %lpad.i ]
-  call void @llvm.objc.release(i8* %19) #2
-  br label %ehcleanup
-
-ehcleanup:                                        ; preds = %lpad1.body, %lpad
-  %.pn = phi { i8*, i32 } [ %eh.lpad-body, %lpad1.body ], [ %17, %lpad ]
-  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2
-  call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1
-  resume { i8*, i32 } %.pn
-}
-declare void @callee1(%"struct.SearchSpec::State"* nonnull align 8 dereferenceable(8), %4*)
-declare void @callee2(%"struct.PointerList"* sret(%"struct.PointerList") align 8, i8*, i1 zeroext)
-declare i8* @callee3(%4* %state.coerce)
-declare i8* @callee4(%10* %state.coerce)
-declare void @callee5(%"struct.DynamicCallback"* nonnull align 8 dereferenceable(8), %10*)
-declare i32 @__gxx_personality_v0(...)
-declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
-declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
-declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
-declare void @llvm.objc.clang.arc.noop.use(...)
-declare void @llvm.objc.release(i8*)
-declare i8* @llvm.objc.retain(i8*)
-declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
-
-!1 = !{}

>From 52491c99fa8b30a558749da231fed7544159edca Mon Sep 17 00:00:00 2001
From: Christian Ulmann <christianulmann at gmail.com>
Date: Fri, 3 Nov 2023 21:21:25 +0100
Subject: [PATCH 74/76] [MLIR][LLVM] Remove typed pointer remnants from
 integration tests (#71208)

This commit removes all LLVM dialect typed pointers from the integration
tests. Typed pointers have been deprecated for a while now and are
planned to be removed from the LLVM dialect soon.

Related PSA:
https://discourse.llvm.org/t/psa-removal-of-typed-pointers-from-the-llvm-dialect/74502
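
For illustration, here is a minimal sketch (the function name @copy_elem is
hypothetical, not part of this patch) of what the migration looks like: with
opaque pointers, the pointee type disappears from !llvm.ptr and instead moves
onto the operations that need it, such as llvm.getelementptr, llvm.load, and
llvm.store.

  // Before (typed pointers, deprecated):
  //   %p = llvm.getelementptr %base[%i] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
  //   %v = llvm.load %p : !llvm.ptr<f32>
  // After (opaque pointers): the element type is carried by the op itself.
  llvm.func @copy_elem(%base : !llvm.ptr, %i : i64) {
    %p = llvm.getelementptr %base[%i] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %v = llvm.load %p : !llvm.ptr -> f32
    llvm.store %v, %p : f32, !llvm.ptr
    llvm.return
  }

The hunks below apply exactly this rewrite across the integration tests.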
---
 .../test/Integration/Dialect/SparseTensor/CPU/block.mlir | 2 +-
 .../Dialect/SparseTensor/CPU/dense_output.mlir           | 2 +-
 .../Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir  | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_flatten.mlir         | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_matvec.mlir          | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_mttkrp.mlir          | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_out_simple.mlir      | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir  | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir      | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_spmm.mlir            | 2 +-
 .../Integration/Dialect/SparseTensor/CPU/sparse_sum.mlir | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir        | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_sum_c32.mlir         | 2 +-
 .../Dialect/SparseTensor/CPU/sparse_sum_f16.mlir         | 2 +-
 .../SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir | 2 +-
 .../Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir  | 2 +-
 .../Dialect/SparseTensor/python/test_output.py           | 4 ++--
 .../Vector/CPU/ArmSME/load-store-128-bit-tile.mlir       | 8 ++++----
 .../CPU/X86Vector/test-inline-asm-vector-avx512.mlir     | 9 ++++-----
 .../GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir       | 2 +-
 .../GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir        | 2 +-
 21 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
index e1cdc9ed6ba3d41..b77c1b42baf7ec6 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/block.mlir
@@ -25,7 +25,7 @@
 // REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false
 // R_UN: %{compile} | env %{env} %{run} | FileCheck %s
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #BSR = #sparse_tensor.encoding<{
   map = (i, j) ->
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir
index 4ef8b29ee4e1a84..f11d396dc6f8f7d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir
@@ -31,7 +31,7 @@
 // Do the same run, but now with direct IR generation and VLA vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | env %{env} %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #DenseMatrix = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : dense, d1 : dense)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
index 773c34e1f3dabca..8c81e9df6a0e41d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_ds.mlir
@@ -25,7 +25,7 @@
 // REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false
 // R_UN: %{compile} | env %{env} %{run} | FileCheck %s
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #CSR = #sparse_tensor.encoding<{
   map = (i, j) -> ( i : dense, j : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_flatten.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_flatten.mlir
index 60f2e22ab4a8200..837ea4038cac8b8 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_flatten.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_flatten.mlir
@@ -31,7 +31,7 @@
 // Do the same run, but now with direct IR generation and VLA vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | env %{env} %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseTensor = #sparse_tensor.encoding<{
   // Note that any dimToLvl permutation should give the same results
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
index 19648b25fd7c15f..e2d5e2d976415b7 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
@@ -40,7 +40,7 @@
 // vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | env %{env} %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseMatrix = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : dense, d1 : compressed),
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_mttkrp.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_mttkrp.mlir
index 306b88149e736dc..ed4dc73a43226f5 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_mttkrp.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_mttkrp.mlir
@@ -32,7 +32,7 @@
 // vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | env %{env} %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseTensor = #sparse_tensor.encoding<{
   map = (d0, d1, d2) -> (d0 : compressed, d1 : compressed, d2 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir
index b466cf242da52a6..911785030ba4294 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir
@@ -28,7 +28,7 @@
 // vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | env %{env} %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #DCSR = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed, d1 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
index b1249c73806b16f..2b134d94a9dcf3a 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_matmul.mlir
@@ -32,7 +32,7 @@
 // vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | env %{env} %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseMatrix = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed, d1 : compressed),
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir
index b789450b4f88bba..e7690adac534d89 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir
@@ -32,7 +32,7 @@
 // Do the same run, but now with  VLA vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | env %{env} %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SortedCOO = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
index e8a9ea6e2c5a775..5459293383015ed 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_spmm.mlir
@@ -31,7 +31,7 @@
 // Do the same run, but now with  VLA vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | env %{env} %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseMatrix = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : dense, d1 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum.mlir
index 99b596f869ec09b..89383d7ec84eb6f 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum.mlir
@@ -34,7 +34,7 @@
 // TODO: The test currently only operates on the triangular part of the
 // symmetric matrix.
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseMatrix = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed, d1 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir
index cbedd2300b0eee3..d5e519efb916d1b 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_bf16.mlir
@@ -31,7 +31,7 @@
 
 // UNSUPPORTED: target=aarch64{{.*}}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseMatrix = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed, d1 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_c32.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_c32.mlir
index 13f4f221dff05e5..4a69125394c0b54 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_c32.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_c32.mlir
@@ -34,7 +34,7 @@
 // TODO: The test currently only operates on the triangular part of the
 // symmetric matrix.
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseMatrix = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed, d1 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir
index dfb23d6afc64bce..f6f55b7ab2d7d4d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sum_f16.mlir
@@ -30,7 +30,7 @@
 // Do the same run, but now with  VLA vectorization.
 // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #SparseMatrix = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed, d1 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
index ac5c0f8bead0773..e79696ac4c047ca 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -26,7 +26,7 @@
 // RUNNOT: %{compile} enable-runtime-library=false gpu-data-transfer-strategy=zero-copy" | %{run}
 //
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #CSR = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : dense, d1 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
index 54408d629ec22ec..c1062dd4ee3e938 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
@@ -21,7 +21,7 @@
 // R_UN:  %{compile} enable-runtime-library=false" | %{run}
 //
 
-!Filename = !llvm.ptr<i8>
+!Filename = !llvm.ptr
 
 #CSR = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : dense, d1 : compressed)
diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py
index 5a8c92f7cd21fc5..c9efadb60480c54 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py
+++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py
@@ -19,11 +19,11 @@
 def boilerplate(attr: st.EncodingAttr):
     """Returns boilerplate main method."""
     return f"""
-func.func @main(%p : !llvm.ptr<i8>) -> () attributes {{ llvm.emit_c_interface }} {{
+func.func @main(%p : !llvm.ptr) -> () attributes {{ llvm.emit_c_interface }} {{
   %d = arith.constant sparse<[[0, 0], [1, 1], [0, 9], [9, 0], [4, 4]],
                              [1.0, 2.0, 3.0, 4.0, 5.0]> : tensor<10x10xf64>
   %a = sparse_tensor.convert %d : tensor<10x10xf64> to tensor<10x10xf64, {attr}>
-  sparse_tensor.out %a, %p : tensor<10x10xf64, {attr}>, !llvm.ptr<i8>
+  sparse_tensor.out %a, %p : tensor<10x10xf64, {attr}>, !llvm.ptr
   return
 }}
 """
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
index 78f1bede5a6a529..5a9fccdc31640c0 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
@@ -37,10 +37,10 @@ func.func @vector_copy_i128(%src: memref<?x?xi128>, %dst: memref<?x?xi128>) {
 }
 
 func.func @test_load_store_zaq0() {
-  %init_a_str = llvm.mlir.addressof @init_tile_a : !llvm.ptr<array<17 x i8>>
-  %init_b_str = llvm.mlir.addressof @init_tile_b : !llvm.ptr<array<17 x i8>>
-  %final_a_str = llvm.mlir.addressof @final_tile_a : !llvm.ptr<array<17 x i8>>
-  %final_b_str = llvm.mlir.addressof @final_tile_b : !llvm.ptr<array<17 x i8>>
+  %init_a_str = llvm.mlir.addressof @init_tile_a : !llvm.ptr
+  %init_b_str = llvm.mlir.addressof @init_tile_b : !llvm.ptr
+  %final_a_str = llvm.mlir.addressof @final_tile_a : !llvm.ptr
+  %final_b_str = llvm.mlir.addressof @final_tile_b : !llvm.ptr
 
   %c0 = arith.constant 0 : index
   %min_elts_q = arith.constant 1 : index
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-inline-asm-vector-avx512.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-inline-asm-vector-avx512.mlir
index 4b57a2924883736..828e498543a9f20 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-inline-asm-vector-avx512.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/test-inline-asm-vector-avx512.mlir
@@ -13,18 +13,17 @@ module {
   llvm.func @entry() -> i32 {
     %c0 = llvm.mlir.constant(0 : index) : i64
 
-    %1 = llvm.mlir.addressof @const16 : !llvm.ptr<array<16 x i32>>
+    %1 = llvm.mlir.addressof @const16 : !llvm.ptr
     %ptr = llvm.getelementptr %1[%c0, %c0]
-      : (!llvm.ptr<array<16 x i32>>, i64, i64) -> !llvm.ptr<i32>
-    %ptr2 = llvm.bitcast %ptr :  !llvm.ptr<i32> to !llvm.ptr<vector<16xi32>>
+      : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<16 x i32>
 
     // operand_attrs of *m operands need to be piped through to LLVM for
     // verification to pass.
     %v = llvm.inline_asm
         asm_dialect = intel
         operand_attrs = [{ elementtype = vector<16xi32> }]
-        "vmovdqu32 $0, $1", "=x,*m" %ptr2
-      : (!llvm.ptr<vector<16xi32>>) -> vector<16xi32>
+        "vmovdqu32 $0, $1", "=x,*m" %ptr
+      : (!llvm.ptr) -> vector<16xi32>
 
     // CHECK: 0
     %v0 = vector.extract %v[0]: i32 from vector<16xi32>
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index 2ad39405cc06f4b..19f88306050afb8 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -31,7 +31,7 @@
 module @mymod {
   func.func private @printMemrefF32(memref<*xf32>)
   memref.global "private" @bufferLhsGlobal : !shmemlhs
-  llvm.func @printf(!llvm.ptr<i8>, ...) -> i32
+  llvm.func @printf(!llvm.ptr, ...) -> i32
   func.func @main() {
     %c8192 = arith.constant 8192 : index
     %c-1_i32 = arith.constant -1 : i32
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index 13b9c48dabe85d7..4ce8db0f2cba212 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -41,7 +41,7 @@ module @mymod {
   func.func private @printMemrefF32(memref<*xf32>)
   memref.global "private" @bufferLhsGlobal : !shmemlhs
   memref.global "private" @bufferRhsGlobal : !shmemrhs
-  llvm.func @printf(!llvm.ptr<i8>, ...) -> i32
+  llvm.func @printf(!llvm.ptr, ...) -> i32
   func.func @main() {
     %c32768 = arith.constant 32768 : index
     %c-1_i32 = arith.constant -1 : i32

>From fcc26bad82e190e1ec09bc6fe76ea320f5ffeeeb Mon Sep 17 00:00:00 2001
From: Christian Ulmann <christianulmann at gmail.com>
Date: Fri, 3 Nov 2023 21:21:45 +0100
Subject: [PATCH 75/76] [MLIR][LLVM] Remove typed pointer remnants from target
 tests (#71210)

This commit removes all LLVM dialect typed pointers from the target
tests. Typed pointers have been deprecated for a while now and are
planned to be removed from the LLVM dialect soon.

Related PSA:
https://discourse.llvm.org/t/psa-removal-of-typed-pointers-from-the-llvm-dialect/74502
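
As a brief sketch (the symbol names @bytes and @first_byte are hypothetical,
not taken from this patch), accesses through globals follow the same pattern:
llvm.mlir.addressof now always yields a plain !llvm.ptr, and
llvm.getelementptr states the underlying aggregate type explicitly.

  llvm.mlir.global internal constant @bytes("hello\00") : !llvm.array<6 x i8>
  llvm.func @first_byte() -> i8 {
    %addr = llvm.mlir.addressof @bytes : !llvm.ptr
    %c0 = llvm.mlir.constant(0 : i64) : i64
    %p = llvm.getelementptr %addr[%c0, %c0]
        : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<6 x i8>
    %v = llvm.load %p : !llvm.ptr -> i8
    llvm.return %v : i8
  }

The target tests below are updated accordingly, dropping the intermediate
llvm.bitcast operations that typed pointers used to require.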
---
 mlir/test/Target/LLVMIR/amx.mlir              |   4 +-
 mlir/test/Target/LLVMIR/arm-sme.mlir          |  94 ++---
 mlir/test/Target/LLVMIR/arm-sve.mlir          |  42 +-
 mlir/test/Target/LLVMIR/llvmir-debug.mlir     |   4 +-
 .../test/Target/LLVMIR/llvmir-intrinsics.mlir |   8 +-
 mlir/test/Target/LLVMIR/llvmir-invalid.mlir   |   4 +-
 mlir/test/Target/LLVMIR/llvmir-types.mlir     |  44 +-
 mlir/test/Target/LLVMIR/llvmir.mlir           | 381 +++++++++---------
 mlir/test/Target/LLVMIR/nvvmir.mlir           |  36 +-
 mlir/test/Target/LLVMIR/openacc-llvm.mlir     |  40 +-
 mlir/test/Target/LLVMIR/openmp-nested.mlir    |   2 +-
 11 files changed, 307 insertions(+), 352 deletions(-)

diff --git a/mlir/test/Target/LLVMIR/amx.mlir b/mlir/test/Target/LLVMIR/amx.mlir
index 4df349b17b0a024..0281dfcd6ad69fe 100644
--- a/mlir/test/Target/LLVMIR/amx.mlir
+++ b/mlir/test/Target/LLVMIR/amx.mlir
@@ -3,11 +3,11 @@
 // CHECK-LABEL: define void @target(ptr %0)
 // CHECK: %[[c:.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 16)
 // CHECK: call void @llvm.x86.tilestored64.internal(i16 16, i16 16, ptr %0, i64 32, x86_amx %[[c]]
-llvm.func @target(%ptr: !llvm.ptr<i8>) {
+llvm.func @target(%ptr: !llvm.ptr) {
   %c = llvm.mlir.constant(16 : i16) : i16
   %s = llvm.mlir.constant(32 : i64) : i64
   %0 = "amx.tilezero"(%c, %c) : (i16, i16) -> !llvm.array<16 x vector<16xbf16>>
-  "amx.tilestored64"(%c, %c, %ptr, %s, %0) : (i16, i16, !llvm.ptr<i8>, i64, !llvm.array<16 x vector<16xbf16>>) -> ()
+  "amx.tilestored64"(%c, %c, %ptr, %s, %0) : (i16, i16, !llvm.ptr, i64, !llvm.array<16 x vector<16xbf16>>) -> ()
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/arm-sme.mlir b/mlir/test/Target/LLVMIR/arm-sme.mlir
index 628d7ba4b649e51..27c94d9aeac8bf4 100644
--- a/mlir/test/Target/LLVMIR/arm-sme.mlir
+++ b/mlir/test/Target/LLVMIR/arm-sme.mlir
@@ -138,42 +138,38 @@ llvm.func @arm_sme_load(%nxv1i1  : vector<[1]xi1>,
                         %nxv4i1  : vector<[4]xi1>,
                         %nxv8i1  : vector<[8]xi1>,
                         %nxv16i1 : vector<[16]xi1>,
-                        %p8      : !llvm.ptr<i8>,
-                        %p16     : !llvm.ptr<i16>,
-                        %p32     : !llvm.ptr<i32>,
-                        %p64     : !llvm.ptr<i64>,
-                        %p128    : !llvm.ptr<i128>) {
+                        %ptr    : !llvm.ptr) {
   %c0 = llvm.mlir.constant(0 : index) : i32
   // CHECK: call void @llvm.aarch64.sme.ld1q.horiz
-  "arm_sme.intr.ld1q.horiz"(%nxv1i1, %p128, %c0, %c0) :
-              (vector<[1]xi1>, !llvm.ptr<i128>, i32, i32) -> ()
+  "arm_sme.intr.ld1q.horiz"(%nxv1i1, %ptr, %c0, %c0) :
+              (vector<[1]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1d.horiz
-  "arm_sme.intr.ld1d.horiz"(%nxv2i1, %p64, %c0, %c0) :
-              (vector<[2]xi1>, !llvm.ptr<i64>, i32, i32) -> ()
+  "arm_sme.intr.ld1d.horiz"(%nxv2i1, %ptr, %c0, %c0) :
+              (vector<[2]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1w.horiz
-  "arm_sme.intr.ld1w.horiz"(%nxv4i1, %p32, %c0, %c0) :
-              (vector<[4]xi1>, !llvm.ptr<i32>, i32, i32) -> ()
+  "arm_sme.intr.ld1w.horiz"(%nxv4i1, %ptr, %c0, %c0) :
+              (vector<[4]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1h.horiz
-  "arm_sme.intr.ld1h.horiz"(%nxv8i1, %p16, %c0, %c0) :
-              (vector<[8]xi1>, !llvm.ptr<i16>, i32, i32) -> ()
+  "arm_sme.intr.ld1h.horiz"(%nxv8i1, %ptr, %c0, %c0) :
+              (vector<[8]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1b.horiz
-  "arm_sme.intr.ld1b.horiz"(%nxv16i1, %p8, %c0, %c0) :
-              (vector<[16]xi1>, !llvm.ptr<i8>, i32, i32) -> ()
+  "arm_sme.intr.ld1b.horiz"(%nxv16i1, %ptr, %c0, %c0) :
+              (vector<[16]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1q.vert
-  "arm_sme.intr.ld1q.vert"(%nxv1i1, %p128, %c0, %c0) :
-              (vector<[1]xi1>, !llvm.ptr<i128>, i32, i32) -> ()
+  "arm_sme.intr.ld1q.vert"(%nxv1i1, %ptr, %c0, %c0) :
+              (vector<[1]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1d.vert
-  "arm_sme.intr.ld1d.vert"(%nxv2i1, %p64, %c0, %c0) :
-              (vector<[2]xi1>, !llvm.ptr<i64>, i32, i32) -> ()
+  "arm_sme.intr.ld1d.vert"(%nxv2i1, %ptr, %c0, %c0) :
+              (vector<[2]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1w.vert
-  "arm_sme.intr.ld1w.vert"(%nxv4i1, %p32, %c0, %c0) :
-              (vector<[4]xi1>, !llvm.ptr<i32>, i32, i32) -> ()
+  "arm_sme.intr.ld1w.vert"(%nxv4i1, %ptr, %c0, %c0) :
+              (vector<[4]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1h.vert
-  "arm_sme.intr.ld1h.vert"(%nxv8i1, %p16, %c0, %c0) :
-              (vector<[8]xi1>, !llvm.ptr<i16>, i32, i32) -> ()
+  "arm_sme.intr.ld1h.vert"(%nxv8i1, %ptr, %c0, %c0) :
+              (vector<[8]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.ld1b.vert
-  "arm_sme.intr.ld1b.vert"(%nxv16i1, %p8, %c0, %c0) :
-              (vector<[16]xi1>, !llvm.ptr<i8>, i32, i32) -> ()
+  "arm_sme.intr.ld1b.vert"(%nxv16i1, %ptr, %c0, %c0) :
+              (vector<[16]xi1>, !llvm.ptr, i32, i32) -> ()
   llvm.return
 }
 
@@ -185,44 +181,40 @@ llvm.func @arm_sme_store(%nxv1i1  : vector<[1]xi1>,
                          %nxv4i1  : vector<[4]xi1>,
                          %nxv8i1  : vector<[8]xi1>,
                          %nxv16i1 : vector<[16]xi1>,
-                         %p8      : !llvm.ptr<i8>,
-                         %p16     : !llvm.ptr<i16>,
-                         %p32     : !llvm.ptr<i32>,
-                         %p64     : !llvm.ptr<i64>,
-                         %p128    : !llvm.ptr<i128>) {
+                         %ptr    : !llvm.ptr) {
   %c0 = llvm.mlir.constant(0 : index) : i32
   // CHECK: call void @llvm.aarch64.sme.st1q.horiz
-  "arm_sme.intr.st1q.horiz"(%nxv1i1, %p128, %c0, %c0) :
-              (vector<[1]xi1>, !llvm.ptr<i128>, i32, i32) -> ()
+  "arm_sme.intr.st1q.horiz"(%nxv1i1, %ptr, %c0, %c0) :
+              (vector<[1]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1d.horiz
-  "arm_sme.intr.st1d.horiz"(%nxv2i1, %p64, %c0, %c0) :
-              (vector<[2]xi1>, !llvm.ptr<i64>, i32, i32) -> ()
+  "arm_sme.intr.st1d.horiz"(%nxv2i1, %ptr, %c0, %c0) :
+              (vector<[2]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1w.horiz
-  "arm_sme.intr.st1w.horiz"(%nxv4i1, %p32, %c0, %c0) :
-              (vector<[4]xi1>, !llvm.ptr<i32>, i32, i32) -> ()
+  "arm_sme.intr.st1w.horiz"(%nxv4i1, %ptr, %c0, %c0) :
+              (vector<[4]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1h.horiz
-  "arm_sme.intr.st1h.horiz"(%nxv8i1, %p16, %c0, %c0) :
-              (vector<[8]xi1>, !llvm.ptr<i16>, i32, i32) -> ()
+  "arm_sme.intr.st1h.horiz"(%nxv8i1, %ptr, %c0, %c0) :
+              (vector<[8]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1b.horiz
-  "arm_sme.intr.st1b.horiz"(%nxv16i1, %p8, %c0, %c0) :
-              (vector<[16]xi1>, !llvm.ptr<i8>, i32, i32) -> ()
+  "arm_sme.intr.st1b.horiz"(%nxv16i1, %ptr, %c0, %c0) :
+              (vector<[16]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1q.vert
-  "arm_sme.intr.st1q.vert"(%nxv1i1, %p128, %c0, %c0) :
-              (vector<[1]xi1>, !llvm.ptr<i128>, i32, i32) -> ()
+  "arm_sme.intr.st1q.vert"(%nxv1i1, %ptr, %c0, %c0) :
+              (vector<[1]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1d.vert
-  "arm_sme.intr.st1d.vert"(%nxv2i1, %p64, %c0, %c0) :
-              (vector<[2]xi1>, !llvm.ptr<i64>, i32, i32) -> ()
+  "arm_sme.intr.st1d.vert"(%nxv2i1, %ptr, %c0, %c0) :
+              (vector<[2]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1w.vert
-  "arm_sme.intr.st1w.vert"(%nxv4i1, %p32, %c0, %c0) :
-              (vector<[4]xi1>, !llvm.ptr<i32>, i32, i32) -> ()
+  "arm_sme.intr.st1w.vert"(%nxv4i1, %ptr, %c0, %c0) :
+              (vector<[4]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1h.vert
-  "arm_sme.intr.st1h.vert"(%nxv8i1, %p16, %c0, %c0) :
-              (vector<[8]xi1>, !llvm.ptr<i16>, i32, i32) -> ()
+  "arm_sme.intr.st1h.vert"(%nxv8i1, %ptr, %c0, %c0) :
+              (vector<[8]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.st1b.vert
-  "arm_sme.intr.st1b.vert"(%nxv16i1, %p8, %c0, %c0) :
-              (vector<[16]xi1>, !llvm.ptr<i8>, i32, i32) -> ()
+  "arm_sme.intr.st1b.vert"(%nxv16i1, %ptr, %c0, %c0) :
+              (vector<[16]xi1>, !llvm.ptr, i32, i32) -> ()
   // CHECK: call void @llvm.aarch64.sme.str
-  "arm_sme.intr.str"(%c0, %p8) : (i32, !llvm.ptr<i8>) -> ()
+  "arm_sme.intr.str"(%c0, %ptr) : (i32, !llvm.ptr) -> ()
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/arm-sve.mlir b/mlir/test/Target/LLVMIR/arm-sve.mlir
index 172a2f7d12d440e..b63d3f06515690a 100644
--- a/mlir/test/Target/LLVMIR/arm-sve.mlir
+++ b/mlir/test/Target/LLVMIR/arm-sve.mlir
@@ -191,44 +191,44 @@ llvm.func @arm_sve_abs_diff(%arg0: vector<[4]xi32>,
 }
 
 // CHECK-LABEL: define void @memcopy
-llvm.func @memcopy(%arg0: !llvm.ptr<f32>, %arg1: !llvm.ptr<f32>,
+llvm.func @memcopy(%arg0: !llvm.ptr, %arg1: !llvm.ptr,
                    %arg2: i64, %arg3: i64, %arg4: i64,
-                   %arg5: !llvm.ptr<f32>, %arg6: !llvm.ptr<f32>,
+                   %arg5: !llvm.ptr, %arg6: !llvm.ptr,
                    %arg7: i64, %arg8: i64, %arg9: i64,
                    %arg10: i64) {
-  %0 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64,
                                        array<1 x i64>, array<1 x i64>)>
-  %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64,
                                                      array<1 x i64>,
                                                      array<1 x i64>)>
-  %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64,
                                                      array<1 x i64>,
                                                      array<1 x i64>)>
-  %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64,
                                                      array<1 x i64>,
                                                      array<1 x i64>)>
-  %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64,
                                                         array<1 x i64>,
                                                         array<1 x i64>)>
-  %5 = llvm.insertvalue %arg4, %4[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %5 = llvm.insertvalue %arg4, %4[4, 0] : !llvm.struct<(ptr, ptr, i64,
                                                         array<1 x i64>,
                                                         array<1 x i64>)>
-  %6 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64,
                                        array<1 x i64>,
                                        array<1 x i64>)>
-  %7 = llvm.insertvalue %arg5, %6[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %7 = llvm.insertvalue %arg5, %6[0] : !llvm.struct<(ptr, ptr, i64,
                                                      array<1 x i64>,
                                                      array<1 x i64>)>
-  %8 = llvm.insertvalue %arg6, %7[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %8 = llvm.insertvalue %arg6, %7[1] : !llvm.struct<(ptr, ptr, i64,
                                                      array<1 x i64>,
                                                      array<1 x i64>)>
-  %9 = llvm.insertvalue %arg7, %8[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %9 = llvm.insertvalue %arg7, %8[2] : !llvm.struct<(ptr, ptr, i64,
                                                      array<1 x i64>,
                                                      array<1 x i64>)>
-  %10 = llvm.insertvalue %arg8, %9[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %10 = llvm.insertvalue %arg8, %9[3, 0] : !llvm.struct<(ptr, ptr, i64,
                                                          array<1 x i64>,
                                                          array<1 x i64>)>
-  %11 = llvm.insertvalue %arg9, %10[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %11 = llvm.insertvalue %arg9, %10[4, 0] : !llvm.struct<(ptr, ptr, i64,
                                                          array<1 x i64>,
                                                          array<1 x i64>)>
   %12 = llvm.mlir.constant(0 : index) : i64
@@ -243,23 +243,21 @@ llvm.func @memcopy(%arg0: !llvm.ptr<f32>, %arg1: !llvm.ptr<f32>,
   llvm.cond_br %17, ^bb2, ^bb3
 ^bb2:
   // CHECK: extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] }
-  %18 = llvm.extractvalue %5[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %18 = llvm.extractvalue %5[1] : !llvm.struct<(ptr, ptr, i64,
                                                 array<1 x i64>,
                                                 array<1 x i64>)>
   // CHECK: getelementptr float, ptr
-  %19 = llvm.getelementptr %18[%16] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  %20 = llvm.bitcast %19 : !llvm.ptr<f32> to !llvm.ptr<vector<[4]xf32>>
+  %19 = llvm.getelementptr %18[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
   // CHECK: load <vscale x 4 x float>, ptr
-  %21 = llvm.load %20 : !llvm.ptr<vector<[4]xf32>>
+  %21 = llvm.load %19 : !llvm.ptr -> vector<[4]xf32>
   // CHECK: extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] }
-  %22 = llvm.extractvalue %11[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64,
+  %22 = llvm.extractvalue %11[1] : !llvm.struct<(ptr, ptr, i64,
                                                  array<1 x i64>,
                                                  array<1 x i64>)>
   // CHECK: getelementptr float, ptr
-  %23 = llvm.getelementptr %22[%16] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  %24 = llvm.bitcast %23 : !llvm.ptr<f32> to !llvm.ptr<vector<[4]xf32>>
+  %23 = llvm.getelementptr %22[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f32
   // CHECK: store <vscale x 4 x float> %{{[0-9]+}}, ptr %{{[0-9]+}}
-  llvm.store %21, %24 : !llvm.ptr<vector<[4]xf32>>
+  llvm.store %21, %23 : vector<[4]xf32>, !llvm.ptr
   %25 = llvm.add %16, %15  : i64
   llvm.br ^bb1(%25 : i64)
 ^bb3:
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index 8d1734d7cdc3117..ea962c66cb8eff9 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -89,13 +89,13 @@ llvm.func @func_no_debug() {
 llvm.func @func_with_debug(%arg: i64) {
   // CHECK: %[[ALLOC:.*]] = alloca
   %allocCount = llvm.mlir.constant(1 : i32) : i32
-  %alloc = llvm.alloca %allocCount x i64 : (i32) -> !llvm.ptr<i64>
+  %alloc = llvm.alloca %allocCount x i64 : (i32) -> !llvm.ptr
 
   // CHECK: call void @llvm.dbg.value(metadata i64 %[[ARG]], metadata ![[VAR_LOC:[0-9]+]], metadata !DIExpression())
   // CHECK: call void @llvm.dbg.declare(metadata ptr %[[ALLOC]], metadata ![[ADDR_LOC:[0-9]+]], metadata !DIExpression())
   // CHECK: call void @llvm.dbg.value(metadata i64 %[[ARG]], metadata ![[NO_NAME_VAR:[0-9]+]], metadata !DIExpression())
   llvm.intr.dbg.value #variable = %arg : i64
-  llvm.intr.dbg.declare #variableAddr = %alloc : !llvm.ptr<i64>
+  llvm.intr.dbg.declare #variableAddr = %alloc : !llvm.ptr
   llvm.intr.dbg.value #noNameVariable= %arg : i64
 
   // CHECK: call void @func_no_debug(), !dbg ![[CALLSITE_LOC:[0-9]+]]
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index d23991b65523fcf..e586c0cd2720e48 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -427,16 +427,16 @@ llvm.func @masked_load_store_intrinsics(%A: !llvm.ptr, %mask: vector<7xi1>) {
 }
 
 // CHECK-LABEL: @masked_gather_scatter_intrinsics
-llvm.func @masked_gather_scatter_intrinsics(%M: !llvm.vec<7 x ptr<f32>>, %mask: vector<7xi1>) {
+llvm.func @masked_gather_scatter_intrinsics(%M: !llvm.vec<7 x ptr>, %mask: vector<7xi1>) {
   // CHECK: call <7 x float> @llvm.masked.gather.v7f32.v7p0(<7 x ptr> %{{.*}}, i32 1, <7 x i1> %{{.*}}, <7 x float> poison)
   %a = llvm.intr.masked.gather %M, %mask { alignment = 1: i32} :
-      (!llvm.vec<7 x ptr<f32>>, vector<7xi1>) -> vector<7xf32>
+      (!llvm.vec<7 x ptr>, vector<7xi1>) -> vector<7xf32>
   // CHECK: call <7 x float> @llvm.masked.gather.v7f32.v7p0(<7 x ptr> %{{.*}}, i32 1, <7 x i1> %{{.*}}, <7 x float> %{{.*}})
   %b = llvm.intr.masked.gather %M, %mask, %a { alignment = 1: i32} :
-      (!llvm.vec<7 x ptr<f32>>, vector<7xi1>, vector<7xf32>) -> vector<7xf32>
+      (!llvm.vec<7 x ptr>, vector<7xi1>, vector<7xf32>) -> vector<7xf32>
   // CHECK: call void @llvm.masked.scatter.v7f32.v7p0(<7 x float> %{{.*}}, <7 x ptr> %{{.*}}, i32 1, <7 x i1> %{{.*}})
   llvm.intr.masked.scatter %b, %M, %mask { alignment = 1: i32} :
-      vector<7xf32>, vector<7xi1> into !llvm.vec<7 x ptr<f32>>
+      vector<7xf32>, vector<7xi1> into !llvm.vec<7 x ptr>
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
index 2d6ccff2d436fea..9b14f5814987d99 100644
--- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
@@ -229,9 +229,9 @@ llvm.func @masked_gather_intr_wrong_type(%ptrs : vector<7xf32>, %mask : vector<7
 
 // -----
 
-llvm.func @masked_scatter_intr_wrong_type(%vec : f32, %ptrs : !llvm.vec<7xptr<f32>>, %mask : vector<7xi1>) {
+llvm.func @masked_scatter_intr_wrong_type(%vec : f32, %ptrs : !llvm.vec<7xptr>, %mask : vector<7xi1>) {
   // expected-error @below{{op operand #0 must be LLVM dialect-compatible vector type, but got 'f32'}}
-  llvm.intr.masked.scatter %vec, %ptrs, %mask { alignment = 1: i32} : f32, vector<7xi1> into !llvm.vec<7xptr<f32>>
+  llvm.intr.masked.scatter %vec, %ptrs, %mask { alignment = 1: i32} : f32, vector<7xi1> into !llvm.vec<7xptr>
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/llvmir-types.mlir b/mlir/test/Target/LLVMIR/llvmir-types.mlir
index a92d46dfadfe25c..c85fa0101c00d74 100644
--- a/mlir/test/Target/LLVMIR/llvmir-types.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-types.mlir
@@ -40,7 +40,7 @@ llvm.func @f_void_variadic(...)
 // CHECK: declare void @f_void_i32_i32_variadic(i32, i32, ...)
 llvm.func @f_void_i32_i32_variadic(i32, i32, ...)
 // CHECK: declare ptr @f_f_i32_i32()
-llvm.func @f_f_i32_i32() -> !llvm.ptr<func<i32 (i32)>>
+llvm.func @f_f_i32_i32() -> !llvm.ptr
 
 //
 // Integers.
@@ -65,22 +65,12 @@ llvm.func @return_i129() -> i129
 // Pointers.
 //
 
-// CHECK: declare ptr @return_pi8()
-llvm.func @return_pi8() -> !llvm.ptr<i8>
-// CHECK: declare ptr @return_pfloat()
-llvm.func @return_pfloat() -> !llvm.ptr<f32>
-// CHECK: declare ptr @return_ppi8()
-llvm.func @return_ppi8() -> !llvm.ptr<ptr<i8>>
-// CHECK: declare ptr @return_pppppi8()
-llvm.func @return_pppppi8() -> !llvm.ptr<ptr<ptr<ptr<ptr<i8>>>>>
-// CHECK: declare ptr @return_pi8_0()
-llvm.func @return_pi8_0() -> !llvm.ptr<i8, 0>
-// CHECK: declare ptr addrspace(1) @return_pi8_1()
-llvm.func @return_pi8_1() -> !llvm.ptr<i8, 1>
-// CHECK: declare ptr addrspace(42) @return_pi8_42()
-llvm.func @return_pi8_42() -> !llvm.ptr<i8, 42>
-// CHECK: declare ptr addrspace(9) @return_ppi8_42_9()
-llvm.func @return_ppi8_42_9() -> !llvm.ptr<ptr<i8, 42>, 9>
+// CHECK: declare ptr @return_p()
+llvm.func @return_p() -> !llvm.ptr
+// CHECK: declare ptr addrspace(1) @return_p_1()
+llvm.func @return_p_1() -> !llvm.ptr<1>
+// CHECK: declare ptr addrspace(42) @return_p_42()
+llvm.func @return_p_42() -> !llvm.ptr<42>
 
 //
 // Vectors.
@@ -97,7 +87,7 @@ llvm.func @return_vs_4_i32() -> !llvm.vec<?x4 x i32>
 // CHECK: declare <vscale x 8 x half> @return_vs_8_half()
 llvm.func @return_vs_8_half() -> !llvm.vec<?x8 x f16>
 // CHECK: declare <4 x ptr> @return_v_4_pi8()
-llvm.func @return_v_4_pi8() -> !llvm.vec<4xptr<i8>>
+llvm.func @return_v_4_pi8() -> !llvm.vec<4xptr>
 
 //
 // Arrays.
@@ -107,8 +97,8 @@ llvm.func @return_v_4_pi8() -> !llvm.vec<4xptr<i8>>
 llvm.func @return_a10_i32() -> !llvm.array<10 x i32>
 // CHECK: declare [8 x float] @return_a8_float()
 llvm.func @return_a8_float() -> !llvm.array<8 x f32>
-// CHECK: declare [10 x ptr addrspace(4)] @return_a10_pi32_4()
-llvm.func @return_a10_pi32_4() -> !llvm.array<10 x ptr<i32, 4>>
+// CHECK: declare [10 x ptr addrspace(4)] @return_a10_p_4()
+llvm.func @return_a10_p_4() -> !llvm.array<10 x ptr<4>>
 // CHECK: declare [10 x [4 x float]] @return_a10_a4_float()
 llvm.func @return_a10_a4_float() -> !llvm.array<10 x array<4 x f32>>
 
@@ -160,12 +150,9 @@ llvm.func @return_target_ext_params() -> !llvm.target<"target-params", i32, f64,
 // CHECK: %empty = type {}
 // CHECK: %opaque = type opaque
 // CHECK: %long = type { i32, { i32, i1 }, float, ptr }
-// CHECK: %self-recursive = type { ptr }
 // CHECK: %unpacked = type { i32 }
 // CHECK: %packed = type <{ i32 }>
 // CHECK: %"name with spaces and !^$@$#" = type <{ i32 }>
-// CHECK: %mutually-a = type { ptr }
-// CHECK: %mutually-b = type { ptr addrspace(3) }
 // CHECK: %struct-of-arrays = type { [10 x i32] }
 // CHECK: %array-of-structs = type { i32 }
 
@@ -174,9 +161,7 @@ llvm.func @return_s_empty() -> !llvm.struct<"empty", ()>
 // CHECK: declare %opaque
 llvm.func @return_s_opaque() -> !llvm.struct<"opaque", opaque>
 // CHECK: declare %long
-llvm.func @return_s_long() -> !llvm.struct<"long", (i32, struct<(i32, i1)>, f32, ptr<func<void ()>>)>
-// CHECK: declare %self-recursive
-llvm.func @return_s_self_recursive() -> !llvm.struct<"self-recursive", (ptr<struct<"self-recursive">>)>
+llvm.func @return_s_long() -> !llvm.struct<"long", (i32, struct<(i32, i1)>, f32, ptr)>
 // CHECK: declare %unpacked
 llvm.func @return_s_unpacked() -> !llvm.struct<"unpacked", (i32)>
 // CHECK: declare %packed
@@ -184,14 +169,7 @@ llvm.func @return_s_packed() -> !llvm.struct<"packed", packed (i32)>
 // CHECK: declare %"name with spaces and !^$@$#"
 llvm.func @return_s_symbols() -> !llvm.struct<"name with spaces and !^$@$#", packed (i32)>
 
-// CHECK: declare %mutually-a
-llvm.func @return_s_mutually_a() -> !llvm.struct<"mutually-a", (ptr<struct<"mutually-b", (ptr<struct<"mutually-a">, 3>)>>)>
-// CHECK: declare %mutually-b
-llvm.func @return_s_mutually_b() -> !llvm.struct<"mutually-b", (ptr<struct<"mutually-a", (ptr<struct<"mutually-b">>)>, 3>)>
-
 // CHECK: declare %struct-of-arrays
 llvm.func @return_s_struct_of_arrays() -> !llvm.struct<"struct-of-arrays", (array<10 x i32>)>
 // CHECK: declare [10 x %array-of-structs]
 llvm.func @return_s_array_of_structs() -> !llvm.array<10 x struct<"array-of-structs", (i32)>>
-// CHECK: declare ptr
-llvm.func @return_s_ptr_to_struct() -> !llvm.ptr<struct<"ptr-to-struct", (i8)>>
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index 7da44b6fbe1ab33..3f27b247edd3e67 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -64,11 +64,11 @@ llvm.mlir.global external @explicit_undef() : i32 {
 }
 
 // CHECK: @int_gep = internal constant ptr getelementptr (i32, ptr @i32_global, i32 2)
-llvm.mlir.global internal constant @int_gep() : !llvm.ptr<i32> {
-  %addr = llvm.mlir.addressof @i32_global : !llvm.ptr<i32>
+llvm.mlir.global internal constant @int_gep() : !llvm.ptr {
+  %addr = llvm.mlir.addressof @i32_global : !llvm.ptr
   %_c0 = llvm.mlir.constant(2: i32) :i32
-  %gepinit = llvm.getelementptr %addr[%_c0] : (!llvm.ptr<i32>, i32) -> !llvm.ptr<i32>
-  llvm.return %gepinit : !llvm.ptr<i32>
+  %gepinit = llvm.getelementptr %addr[%_c0] : (!llvm.ptr, i32) -> !llvm.ptr, i32
+  llvm.return %gepinit : !llvm.ptr
 }
 
 // CHECK{LITERAL}: @dense_float_vector = internal global <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
@@ -177,7 +177,7 @@ llvm.mlir.global internal constant @sectionvar("teststring")  {section = ".mysec
 //
 
 // CHECK: declare ptr @malloc(i64)
-llvm.func @malloc(i64) -> !llvm.ptr<i8>
+llvm.func @malloc(i64) -> !llvm.ptr
 // CHECK: declare void @free(ptr)
 
 
@@ -197,15 +197,15 @@ llvm.func @empty() {
 llvm.func @global_refs() {
   // Check load from globals.
   // CHECK: load i32, ptr @i32_global
-  %0 = llvm.mlir.addressof @i32_global : !llvm.ptr<i32>
-  %1 = llvm.load %0 : !llvm.ptr<i32>
+  %0 = llvm.mlir.addressof @i32_global : !llvm.ptr
+  %1 = llvm.load %0 : !llvm.ptr -> i32
 
   // Check the contracted form of load from array constants.
   // CHECK: load i8, ptr @string_const
-  %2 = llvm.mlir.addressof @string_const : !llvm.ptr<array<6 x i8>>
+  %2 = llvm.mlir.addressof @string_const : !llvm.ptr
   %c0 = llvm.mlir.constant(0 : index) : i64
-  %3 = llvm.getelementptr %2[%c0, %c0] : (!llvm.ptr<array<6 x i8>>, i64, i64) -> !llvm.ptr<i8>
-  %4 = llvm.load %3 : !llvm.ptr<i8>
+  %3 = llvm.getelementptr %2[%c0, %c0] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.array<6 x i8>
+  %4 = llvm.load %3 : !llvm.ptr -> i8
 
   llvm.return
 }
@@ -547,12 +547,11 @@ llvm.func @memref_alloc() {
   %0 = llvm.mlir.constant(10 : index) : i64
   %1 = llvm.mlir.constant(10 : index) : i64
   %2 = llvm.mul %0, %1 : i64
-  %3 = llvm.mlir.undef : !llvm.struct<(ptr<f32>)>
+  %3 = llvm.mlir.undef : !llvm.struct<(ptr)>
   %4 = llvm.mlir.constant(4 : index) : i64
   %5 = llvm.mul %2, %4 : i64
-  %6 = llvm.call @malloc(%5) : (i64) -> !llvm.ptr<i8>
-  %7 = llvm.bitcast %6 : !llvm.ptr<i8> to !llvm.ptr<f32>
-  %8 = llvm.insertvalue %7, %3[0] : !llvm.struct<(ptr<f32>)>
+  %6 = llvm.call @malloc(%5) : (i64) -> !llvm.ptr
+  %7 = llvm.insertvalue %6, %3[0] : !llvm.struct<(ptr)>
 // CHECK-NEXT: ret void
   llvm.return
 }
@@ -566,12 +565,11 @@ llvm.func @store_load_static() {
 // CHECK-NEXT: %{{[0-9]+}} = call ptr @malloc(i64 40)
 // CHECK-NEXT: %{{[0-9]+}} = insertvalue { ptr } undef, ptr %{{[0-9]+}}, 0
   %0 = llvm.mlir.constant(10 : index) : i64
-  %1 = llvm.mlir.undef : !llvm.struct<(ptr<f32>)>
+  %1 = llvm.mlir.undef : !llvm.struct<(ptr)>
   %2 = llvm.mlir.constant(4 : index) : i64
   %3 = llvm.mul %0, %2 : i64
-  %4 = llvm.call @malloc(%3) : (i64) -> !llvm.ptr<i8>
-  %5 = llvm.bitcast %4 : !llvm.ptr<i8> to !llvm.ptr<f32>
-  %6 = llvm.insertvalue %5, %1[0] : !llvm.struct<(ptr<f32>)>
+  %4 = llvm.call @malloc(%3) : (i64) -> !llvm.ptr
+  %6 = llvm.insertvalue %4, %1[0] : !llvm.struct<(ptr)>
   %7 = llvm.mlir.constant(1.000000e+00 : f32) : f32
   llvm.br ^bb1
 ^bb1:   // pred: ^bb0
@@ -589,9 +587,9 @@ llvm.func @store_load_static() {
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 %{{[0-9]+}}
 // CHECK-NEXT: store float 1.000000e+00, ptr %{{[0-9]+}}
   %12 = llvm.mlir.constant(10 : index) : i64
-  %13 = llvm.extractvalue %6[0] : !llvm.struct<(ptr<f32>)>
-  %14 = llvm.getelementptr %13[%10] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  llvm.store %7, %14 : !llvm.ptr<f32>
+  %13 = llvm.extractvalue %6[0] : !llvm.struct<(ptr)>
+  %14 = llvm.getelementptr %13[%10] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  llvm.store %7, %14 : f32, !llvm.ptr
   %15 = llvm.mlir.constant(1 : index) : i64
 // CHECK-NEXT: %{{[0-9]+}} = add i64 %{{[0-9]+}}, 1
   %16 = llvm.add %10, %15 : i64
@@ -614,9 +612,9 @@ llvm.func @store_load_static() {
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 %{{[0-9]+}}
 // CHECK-NEXT: %{{[0-9]+}} = load float, ptr %{{[0-9]+}}
   %21 = llvm.mlir.constant(10 : index) : i64
-  %22 = llvm.extractvalue %6[0] : !llvm.struct<(ptr<f32>)>
-  %23 = llvm.getelementptr %22[%19] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  %24 = llvm.load %23 : !llvm.ptr<f32>
+  %22 = llvm.extractvalue %6[0] : !llvm.struct<(ptr)>
+  %23 = llvm.getelementptr %22[%19] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  %24 = llvm.load %23 : !llvm.ptr -> f32
   %25 = llvm.mlir.constant(1 : index) : i64
 // CHECK-NEXT: %{{[0-9]+}} = add i64 %{{[0-9]+}}, 1
   %26 = llvm.add %19, %25 : i64
@@ -633,13 +631,12 @@ llvm.func @store_load_dynamic(%arg0: i64) {
 // CHECK-NEXT: %{{[0-9]+}} = call ptr @malloc(i64 %{{[0-9]+}})
 // CHECK-NEXT: %{{[0-9]+}} = insertvalue { ptr, i64 } undef, ptr %{{[0-9]+}}, 0
 // CHECK-NEXT: %{{[0-9]+}} = insertvalue { ptr, i64 } %{{[0-9]+}}, i64 %{{[0-9]+}}, 1
-  %0 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, i64)>
+  %0 = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
   %1 = llvm.mlir.constant(4 : index) : i64
   %2 = llvm.mul %arg0, %1 : i64
-  %3 = llvm.call @malloc(%2) : (i64) -> !llvm.ptr<i8>
-  %4 = llvm.bitcast %3 : !llvm.ptr<i8> to !llvm.ptr<f32>
-  %5 = llvm.insertvalue %4, %0[0] : !llvm.struct<(ptr<f32>, i64)>
-  %6 = llvm.insertvalue %arg0, %5[1] : !llvm.struct<(ptr<f32>, i64)>
+  %3 = llvm.call @malloc(%2) : (i64) -> !llvm.ptr
+  %5 = llvm.insertvalue %3, %0[0] : !llvm.struct<(ptr, i64)>
+  %6 = llvm.insertvalue %arg0, %5[1] : !llvm.struct<(ptr, i64)>
   %7 = llvm.mlir.constant(1.000000e+00 : f32) : f32
 // CHECK-NEXT: br label %{{[0-9]+}}
   llvm.br ^bb1
@@ -657,10 +654,10 @@ llvm.func @store_load_dynamic(%arg0: i64) {
 // CHECK-NEXT: %{{[0-9]+}} = extractvalue { ptr, i64 } %{{[0-9]+}}, 0
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 %{{[0-9]+}}
 // CHECK-NEXT: store float 1.000000e+00, ptr %{{[0-9]+}}
-  %11 = llvm.extractvalue %6[1] : !llvm.struct<(ptr<f32>, i64)>
-  %12 = llvm.extractvalue %6[0] : !llvm.struct<(ptr<f32>, i64)>
-  %13 = llvm.getelementptr %12[%9] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  llvm.store %7, %13 : !llvm.ptr<f32>
+  %11 = llvm.extractvalue %6[1] : !llvm.struct<(ptr, i64)>
+  %12 = llvm.extractvalue %6[0] : !llvm.struct<(ptr, i64)>
+  %13 = llvm.getelementptr %12[%9] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  llvm.store %7, %13 : f32, !llvm.ptr
   %14 = llvm.mlir.constant(1 : index) : i64
 // CHECK-NEXT: %{{[0-9]+}} = add i64 %{{[0-9]+}}, 1
   %15 = llvm.add %9, %14 : i64
@@ -682,10 +679,10 @@ llvm.func @store_load_dynamic(%arg0: i64) {
 // CHECK-NEXT: %{{[0-9]+}} = extractvalue { ptr, i64 } %{{[0-9]+}}, 0
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 %{{[0-9]+}}
 // CHECK-NEXT: %{{[0-9]+}} = load float, ptr %{{[0-9]+}}
-  %19 = llvm.extractvalue %6[1] : !llvm.struct<(ptr<f32>, i64)>
-  %20 = llvm.extractvalue %6[0] : !llvm.struct<(ptr<f32>, i64)>
-  %21 = llvm.getelementptr %20[%17] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  %22 = llvm.load %21 : !llvm.ptr<f32>
+  %19 = llvm.extractvalue %6[1] : !llvm.struct<(ptr, i64)>
+  %20 = llvm.extractvalue %6[0] : !llvm.struct<(ptr, i64)>
+  %21 = llvm.getelementptr %20[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  %22 = llvm.load %21 : !llvm.ptr -> f32
   %23 = llvm.mlir.constant(1 : index) : i64
 // CHECK-NEXT: %{{[0-9]+}} = add i64 %{{[0-9]+}}, 1
   %24 = llvm.add %17, %23 : i64
@@ -712,14 +709,13 @@ llvm.func @store_load_mixed(%arg0: i64) {
   %3 = llvm.mul %1, %arg0 : i64
   %4 = llvm.mul %3, %2 : i64
   %5 = llvm.mul %4, %0 : i64
-  %6 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, i64, i64)>
+  %6 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i64)>
   %7 = llvm.mlir.constant(4 : index) : i64
   %8 = llvm.mul %5, %7 : i64
-  %9 = llvm.call @malloc(%8) : (i64) -> !llvm.ptr<i8>
-  %10 = llvm.bitcast %9 : !llvm.ptr<i8> to !llvm.ptr<f32>
-  %11 = llvm.insertvalue %10, %6[0] : !llvm.struct<(ptr<f32>, i64, i64)>
-  %12 = llvm.insertvalue %arg0, %11[1] : !llvm.struct<(ptr<f32>, i64, i64)>
-  %13 = llvm.insertvalue %0, %12[2] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %9 = llvm.call @malloc(%8) : (i64) -> !llvm.ptr
+  %11 = llvm.insertvalue %9, %6[0] : !llvm.struct<(ptr, i64, i64)>
+  %12 = llvm.insertvalue %arg0, %11[1] : !llvm.struct<(ptr, i64, i64)>
+  %13 = llvm.insertvalue %0, %12[2] : !llvm.struct<(ptr, i64, i64)>
 
 // CHECK-NEXT: %{{[0-9]+}} = call i64 @get_index()
 // CHECK-NEXT: %{{[0-9]+}} = call i64 @get_index()
@@ -740,18 +736,18 @@ llvm.func @store_load_mixed(%arg0: i64) {
 // CHECK-NEXT: %{{[0-9]+}} = extractvalue { ptr, i64, i64 } %{{[0-9]+}}, 0
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 %{{[0-9]+}}
 // CHECK-NEXT: store float 4.200000e+01, ptr %{{[0-9]+}}
-  %20 = llvm.extractvalue %13[1] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %20 = llvm.extractvalue %13[1] : !llvm.struct<(ptr, i64, i64)>
   %21 = llvm.mlir.constant(4 : index) : i64
-  %22 = llvm.extractvalue %13[2] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %22 = llvm.extractvalue %13[2] : !llvm.struct<(ptr, i64, i64)>
   %23 = llvm.mul %14, %20 : i64
   %24 = llvm.add %23, %15 : i64
   %25 = llvm.mul %24, %21 : i64
   %26 = llvm.add %25, %16 : i64
   %27 = llvm.mul %26, %22 : i64
   %28 = llvm.add %27, %17 : i64
-  %29 = llvm.extractvalue %13[0] : !llvm.struct<(ptr<f32>, i64, i64)>
-  %30 = llvm.getelementptr %29[%28] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  llvm.store %18, %30 : !llvm.ptr<f32>
+  %29 = llvm.extractvalue %13[0] : !llvm.struct<(ptr, i64, i64)>
+  %30 = llvm.getelementptr %29[%28] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  llvm.store %18, %30 : f32, !llvm.ptr
 // CHECK-NEXT: %{{[0-9]+}} = extractvalue { ptr, i64, i64 } %{{[0-9]+}}, 1
 // CHECK-NEXT: %{{[0-9]+}} = extractvalue { ptr, i64, i64 } %{{[0-9]+}}, 2
 // CHECK-NEXT: %{{[0-9]+}} = mul i64 %{{[0-9]+}}, %{{[0-9]+}}
@@ -764,24 +760,24 @@ llvm.func @store_load_mixed(%arg0: i64) {
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 %{{[0-9]+}}
 // CHECK-NEXT: %{{[0-9]+}} = load float, ptr %{{[0-9]+}}
   %31 = llvm.mlir.constant(2 : index) : i64
-  %32 = llvm.extractvalue %13[1] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %32 = llvm.extractvalue %13[1] : !llvm.struct<(ptr, i64, i64)>
   %33 = llvm.mlir.constant(4 : index) : i64
-  %34 = llvm.extractvalue %13[2] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %34 = llvm.extractvalue %13[2] : !llvm.struct<(ptr, i64, i64)>
   %35 = llvm.mul %17, %32 : i64
   %36 = llvm.add %35, %16 : i64
   %37 = llvm.mul %36, %33 : i64
   %38 = llvm.add %37, %15 : i64
   %39 = llvm.mul %38, %34 : i64
   %40 = llvm.add %39, %14 : i64
-  %41 = llvm.extractvalue %13[0] : !llvm.struct<(ptr<f32>, i64, i64)>
-  %42 = llvm.getelementptr %41[%40] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  %43 = llvm.load %42 : !llvm.ptr<f32>
+  %41 = llvm.extractvalue %13[0] : !llvm.struct<(ptr, i64, i64)>
+  %42 = llvm.getelementptr %41[%40] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  %43 = llvm.load %42 : !llvm.ptr -> f32
 // CHECK-NEXT: ret void
   llvm.return
 }
 
 // CHECK-LABEL: define { ptr, i64 } @memref_args_rets({ ptr } {{%.*}}, { ptr, i64 } {{%.*}}, { ptr, i64 } {{%.*}})
-llvm.func @memref_args_rets(%arg0: !llvm.struct<(ptr<f32>)>, %arg1: !llvm.struct<(ptr<f32>, i64)>, %arg2: !llvm.struct<(ptr<f32>, i64)>) -> !llvm.struct<(ptr<f32>, i64)> {
+llvm.func @memref_args_rets(%arg0: !llvm.struct<(ptr)>, %arg1: !llvm.struct<(ptr, i64)>, %arg2: !llvm.struct<(ptr, i64)>) -> !llvm.struct<(ptr, i64)> {
   %0 = llvm.mlir.constant(7 : index) : i64
 // CHECK-NEXT: %{{[0-9]+}} = call i64 @get_index()
   %1 = llvm.call @get_index() : () -> i64
@@ -790,17 +786,17 @@ llvm.func @memref_args_rets(%arg0: !llvm.struct<(ptr<f32>)>, %arg1: !llvm.struct
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 7
 // CHECK-NEXT: store float 4.200000e+01, ptr %{{[0-9]+}}
   %3 = llvm.mlir.constant(10 : index) : i64
-  %4 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr<f32>)>
-  %5 = llvm.getelementptr %4[%0] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  llvm.store %2, %5 : !llvm.ptr<f32>
+  %4 = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr)>
+  %5 = llvm.getelementptr %4[%0] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  llvm.store %2, %5 : f32, !llvm.ptr
 // CHECK-NEXT: %{{[0-9]+}} = extractvalue { ptr, i64 } %{{[0-9]+}}, 1
 // CHECK-NEXT: %{{[0-9]+}} = extractvalue { ptr, i64 } %{{[0-9]+}}, 0
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 7
 // CHECK-NEXT: store float 4.200000e+01, ptr %{{[0-9]+}}
-  %6 = llvm.extractvalue %arg1[1] : !llvm.struct<(ptr<f32>, i64)>
-  %7 = llvm.extractvalue %arg1[0] : !llvm.struct<(ptr<f32>, i64)>
-  %8 = llvm.getelementptr %7[%0] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  llvm.store %2, %8 : !llvm.ptr<f32>
+  %6 = llvm.extractvalue %arg1[1] : !llvm.struct<(ptr, i64)>
+  %7 = llvm.extractvalue %arg1[0] : !llvm.struct<(ptr, i64)>
+  %8 = llvm.getelementptr %7[%0] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  llvm.store %2, %8 : f32, !llvm.ptr
 // CHECK-NEXT: %{{[0-9]+}} = extractvalue { ptr, i64 } %{{[0-9]+}}, 1
 // CHECK-NEXT: %{{[0-9]+}} = mul i64 7, %{{[0-9]+}}
 // CHECK-NEXT: %{{[0-9]+}} = add i64 %{{[0-9]+}}, %{{[0-9]+}}
@@ -808,12 +804,12 @@ llvm.func @memref_args_rets(%arg0: !llvm.struct<(ptr<f32>)>, %arg1: !llvm.struct
 // CHECK-NEXT: %{{[0-9]+}} = getelementptr float, ptr %{{[0-9]+}}, i64 %{{[0-9]+}}
 // CHECK-NEXT: store float 4.200000e+01, ptr %{{[0-9]+}}
   %9 = llvm.mlir.constant(10 : index) : i64
-  %10 = llvm.extractvalue %arg2[1] : !llvm.struct<(ptr<f32>, i64)>
+  %10 = llvm.extractvalue %arg2[1] : !llvm.struct<(ptr, i64)>
   %11 = llvm.mul %0, %10 : i64
   %12 = llvm.add %11, %1 : i64
-  %13 = llvm.extractvalue %arg2[0] : !llvm.struct<(ptr<f32>, i64)>
-  %14 = llvm.getelementptr %13[%12] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  llvm.store %2, %14 : !llvm.ptr<f32>
+  %13 = llvm.extractvalue %arg2[0] : !llvm.struct<(ptr, i64)>
+  %14 = llvm.getelementptr %13[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  llvm.store %2, %14 : f32, !llvm.ptr
 // CHECK-NEXT: %{{[0-9]+}} = mul i64 10, %{{[0-9]+}}
 // CHECK-NEXT: %{{[0-9]+}} = mul i64 %{{[0-9]+}}, 4
 // CHECK-NEXT: %{{[0-9]+}} = call ptr @malloc(i64 %{{[0-9]+}})
@@ -821,28 +817,27 @@ llvm.func @memref_args_rets(%arg0: !llvm.struct<(ptr<f32>)>, %arg1: !llvm.struct
 // CHECK-NEXT: %{{[0-9]+}} = insertvalue { ptr, i64 } %{{[0-9]+}}, i64 %{{[0-9]+}}, 1
   %15 = llvm.mlir.constant(10 : index) : i64
   %16 = llvm.mul %15, %1 : i64
-  %17 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, i64)>
+  %17 = llvm.mlir.undef : !llvm.struct<(ptr, i64)>
   %18 = llvm.mlir.constant(4 : index) : i64
   %19 = llvm.mul %16, %18 : i64
-  %20 = llvm.call @malloc(%19) : (i64) -> !llvm.ptr<i8>
-  %21 = llvm.bitcast %20 : !llvm.ptr<i8> to !llvm.ptr<f32>
-  %22 = llvm.insertvalue %21, %17[0] : !llvm.struct<(ptr<f32>, i64)>
-  %23 = llvm.insertvalue %1, %22[1] : !llvm.struct<(ptr<f32>, i64)>
+  %20 = llvm.call @malloc(%19) : (i64) -> !llvm.ptr
+  %22 = llvm.insertvalue %20, %17[0] : !llvm.struct<(ptr, i64)>
+  %23 = llvm.insertvalue %1, %22[1] : !llvm.struct<(ptr, i64)>
 // CHECK-NEXT: ret { ptr, i64 } %{{[0-9]+}}
-  llvm.return %23 : !llvm.struct<(ptr<f32>, i64)>
+  llvm.return %23 : !llvm.struct<(ptr, i64)>
 }
 
 
 // CHECK-LABEL: define i64 @memref_dim({ ptr, i64, i64 } {{%.*}})
-llvm.func @memref_dim(%arg0: !llvm.struct<(ptr<f32>, i64, i64)>) -> i64 {
+llvm.func @memref_dim(%arg0: !llvm.struct<(ptr, i64, i64)>) -> i64 {
 // Expecting this to create an LLVM constant.
   %0 = llvm.mlir.constant(42 : index) : i64
 // CHECK-NEXT: %2 = extractvalue { ptr, i64, i64 } %0, 1
-  %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %1 = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr, i64, i64)>
 // Expecting this to create an LLVM constant.
   %2 = llvm.mlir.constant(10 : index) : i64
 // CHECK-NEXT: %3 = extractvalue { ptr, i64, i64 } %0, 2
-  %3 = llvm.extractvalue %arg0[2] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %3 = llvm.extractvalue %arg0[2] : !llvm.struct<(ptr, i64, i64)>
 // Checking that the constant for d0 has been created.
 // CHECK-NEXT: %4 = add i64 42, %2
   %4 = llvm.add %0, %1 : i64
@@ -857,22 +852,22 @@ llvm.func @memref_dim(%arg0: !llvm.struct<(ptr<f32>, i64, i64)>) -> i64 {
 
 llvm.func @get_i64() -> i64
 llvm.func @get_f32() -> f32
-llvm.func @get_memref() -> !llvm.struct<(ptr<f32>, i64, i64)>
+llvm.func @get_memref() -> !llvm.struct<(ptr, i64, i64)>
 
 // CHECK-LABEL: define { i64, float, { ptr, i64, i64 } } @multireturn()
-llvm.func @multireturn() -> !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)> {
+llvm.func @multireturn() -> !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)> {
   %0 = llvm.call @get_i64() : () -> i64
   %1 = llvm.call @get_f32() : () -> f32
-  %2 = llvm.call @get_memref() : () -> !llvm.struct<(ptr<f32>, i64, i64)>
+  %2 = llvm.call @get_memref() : () -> !llvm.struct<(ptr, i64, i64)>
 // CHECK:        %{{[0-9]+}} = insertvalue { i64, float, { ptr, i64, i64 } } undef, i64 %{{[0-9]+}}, 0
 // CHECK-NEXT:   %{{[0-9]+}} = insertvalue { i64, float, { ptr, i64, i64 } } %{{[0-9]+}}, float %{{[0-9]+}}, 1
 // CHECK-NEXT:   %{{[0-9]+}} = insertvalue { i64, float, { ptr, i64, i64 } } %{{[0-9]+}}, { ptr, i64, i64 } %{{[0-9]+}}, 2
 // CHECK-NEXT:   ret { i64, float, { ptr, i64, i64 } } %{{[0-9]+}}
-  %3 = llvm.mlir.undef : !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
-  %4 = llvm.insertvalue %0, %3[0] : !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
-  %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
-  %6 = llvm.insertvalue %2, %5[2] : !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
-  llvm.return %6 : !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
+  %3 = llvm.mlir.undef : !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
+  %4 = llvm.insertvalue %0, %3[0] : !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
+  %5 = llvm.insertvalue %1, %4[1] : !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
+  %6 = llvm.insertvalue %2, %5[2] : !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
+  llvm.return %6 : !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
 }
 
 
@@ -882,10 +877,10 @@ llvm.func @multireturn_caller() {
 // CHECK-NEXT:   [[ret0:%[0-9]+]] = extractvalue { i64, float, { ptr, i64, i64 } } %1, 0
 // CHECK-NEXT:   [[ret1:%[0-9]+]] = extractvalue { i64, float, { ptr, i64, i64 } } %1, 1
 // CHECK-NEXT:   [[ret2:%[0-9]+]] = extractvalue { i64, float, { ptr, i64, i64 } } %1, 2
-  %0 = llvm.call @multireturn() : () -> !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
-  %1 = llvm.extractvalue %0[0] : !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
-  %2 = llvm.extractvalue %0[1] : !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
-  %3 = llvm.extractvalue %0[2] : !llvm.struct<(i64, f32, struct<(ptr<f32>, i64, i64)>)>
+  %0 = llvm.call @multireturn() : () -> !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
+  %1 = llvm.extractvalue %0[0] : !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
+  %2 = llvm.extractvalue %0[1] : !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
+  %3 = llvm.extractvalue %0[2] : !llvm.struct<(i64, f32, struct<(ptr, i64, i64)>)>
   %4 = llvm.mlir.constant(42) : i64
 // CHECK:   add i64 [[ret0]], 42
   %5 = llvm.add %1, %4 : i64
@@ -895,18 +890,18 @@ llvm.func @multireturn_caller() {
   %8 = llvm.mlir.constant(0 : index) : i64
   %9 = llvm.mlir.constant(42 : index) : i64
 // CHECK:   extractvalue { ptr, i64, i64 } [[ret2]], 0
-  %10 = llvm.extractvalue %3[1] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %10 = llvm.extractvalue %3[1] : !llvm.struct<(ptr, i64, i64)>
   %11 = llvm.mlir.constant(10 : index) : i64
-  %12 = llvm.extractvalue %3[2] : !llvm.struct<(ptr<f32>, i64, i64)>
+  %12 = llvm.extractvalue %3[2] : !llvm.struct<(ptr, i64, i64)>
   %13 = llvm.mul %8, %10 : i64
   %14 = llvm.add %13, %8 : i64
   %15 = llvm.mul %14, %11 : i64
   %16 = llvm.add %15, %8 : i64
   %17 = llvm.mul %16, %12 : i64
   %18 = llvm.add %17, %8 : i64
-  %19 = llvm.extractvalue %3[0] : !llvm.struct<(ptr<f32>, i64, i64)>
-  %20 = llvm.getelementptr %19[%18] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
-  %21 = llvm.load %20 : !llvm.ptr<f32>
+  %19 = llvm.extractvalue %3[0] : !llvm.struct<(ptr, i64, i64)>
+  %20 = llvm.getelementptr %19[%18] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+  %21 = llvm.load %20 : !llvm.ptr -> f32
   llvm.return
 }
 
@@ -987,11 +982,6 @@ llvm.func @vector_splat_nonzero_scalable() -> vector<[4]xf32> {
   llvm.return %0 : vector<[4]xf32>
 }
 
-// CHECK-LABEL: @f8_ptrs(ptr {{%.*}}, ptr {{%.*}})
-llvm.func @f8_ptrs(%arg0: !llvm.ptr<f8E5M2>, %arg1: !llvm.ptr<f8E4M3FN>) {
-  llvm.return
-}
-
 // CHECK-LABEL: @ops
 llvm.func @ops(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: i32) -> !llvm.struct<(f32, i32)> {
 // CHECK-NEXT: fsub float %0, %1
@@ -1040,12 +1030,12 @@ llvm.func @ops(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: i32) -> !llvm.struct<(
 }
 
 // CHECK-LABEL: @gep
-llvm.func @gep(%ptr: !llvm.ptr<struct<(i32, struct<(i32, f32)>)>>, %idx: i64,
-               %ptr2: !llvm.ptr<struct<(array<10 x f32>)>>) {
+llvm.func @gep(%ptr: !llvm.ptr, %idx: i64,
+               %ptr2: !llvm.ptr) {
   // CHECK: = getelementptr { i32, { i32, float } }, ptr %{{.*}}, i64 %{{.*}}, i32 1, i32 0
-  llvm.getelementptr %ptr[%idx, 1, 0] : (!llvm.ptr<struct<(i32, struct<(i32, f32)>)>>, i64) -> !llvm.ptr<i32>
+  llvm.getelementptr %ptr[%idx, 1, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(i32, struct<(i32, f32)>)>
   // CHECK: = getelementptr inbounds { [10 x float] }, ptr %{{.*}}, i64 %{{.*}}, i32 0, i64 %{{.*}}
-  llvm.getelementptr inbounds %ptr2[%idx, 0, %idx] : (!llvm.ptr<struct<(array<10 x f32>)>>, i64, i64) -> !llvm.ptr<f32>
+  llvm.getelementptr inbounds %ptr2[%idx, 0, %idx] : (!llvm.ptr, i64, i64) -> !llvm.ptr, !llvm.struct<(array<10 x f32>)>
   llvm.return
 }
 
@@ -1093,44 +1083,44 @@ llvm.func @cond_br_arguments(%arg0: i1, %arg1: i1) {
 }
 
 // CHECK-LABEL: define void @llvm_noalias(ptr noalias {{%*.}})
-llvm.func @llvm_noalias(%arg0: !llvm.ptr<f32> {llvm.noalias}) {
+llvm.func @llvm_noalias(%arg0: !llvm.ptr {llvm.noalias}) {
   llvm.return
 }
 
 // CHECK-LABEL: declare void @llvm_noalias_decl(ptr noalias)
-llvm.func @llvm_noalias_decl(!llvm.ptr<f32> {llvm.noalias})
+llvm.func @llvm_noalias_decl(!llvm.ptr {llvm.noalias})
 
 // CHECK-LABEL: define void @byrefattr(ptr byref(i32) %
-llvm.func @byrefattr(%arg0: !llvm.ptr<i32> {llvm.byref = i32}) {
+llvm.func @byrefattr(%arg0: !llvm.ptr {llvm.byref = i32}) {
   llvm.return
 }
 
 // CHECK-LABEL: declare void @byrefattr_decl(ptr byref(i32))
-llvm.func @byrefattr_decl(!llvm.ptr<i32> {llvm.byref = i32})
+llvm.func @byrefattr_decl(!llvm.ptr {llvm.byref = i32})
 
 // CHECK-LABEL: define void @byvalattr(ptr byval(i32) %
-llvm.func @byvalattr(%arg0: !llvm.ptr<i32> {llvm.byval = i32}) {
+llvm.func @byvalattr(%arg0: !llvm.ptr {llvm.byval = i32}) {
   llvm.return
 }
 
 // CHECK-LABEL: declare void @byvalattr_decl(ptr byval(i32))
-llvm.func @byvalattr_decl(!llvm.ptr<i32> {llvm.byval = i32})
+llvm.func @byvalattr_decl(!llvm.ptr {llvm.byval = i32})
 
 // CHECK-LABEL: define void @sretattr(ptr sret(i32) %
-llvm.func @sretattr(%arg0: !llvm.ptr<i32> {llvm.sret = i32}) {
+llvm.func @sretattr(%arg0: !llvm.ptr {llvm.sret = i32}) {
   llvm.return
 }
 
 // CHECK-LABEL: declare void @sretattr_decl(ptr sret(i32))
-llvm.func @sretattr_decl(!llvm.ptr<i32> {llvm.sret = i32})
+llvm.func @sretattr_decl(!llvm.ptr {llvm.sret = i32})
 
 // CHECK-LABEL: define void @nestattr(ptr nest %
-llvm.func @nestattr(%arg0: !llvm.ptr<i32> {llvm.nest}) {
+llvm.func @nestattr(%arg0: !llvm.ptr {llvm.nest}) {
   llvm.return
 }
 
 // CHECK-LABEL: declare void @nestattr_decl(ptr nest)
-llvm.func @nestattr_decl(!llvm.ptr<i32> {llvm.nest})
+llvm.func @nestattr_decl(!llvm.ptr {llvm.nest})
 
 // CHECK-LABEL: define void @noundefattr(i32 noundef %
 llvm.func @noundefattr(%arg0: i32 {llvm.noundef}) {
@@ -1141,20 +1131,20 @@ llvm.func @noundefattr(%arg0: i32 {llvm.noundef}) {
 llvm.func @noundefattr_decl(i32 {llvm.noundef})
 
 // CHECK-LABEL: define void @llvm_align(ptr align 4 {{%*.}})
-llvm.func @llvm_align(%arg0: !llvm.ptr<f32> {llvm.align = 4}) {
+llvm.func @llvm_align(%arg0: !llvm.ptr {llvm.align = 4}) {
   llvm.return
 }
 
 // CHECK-LABEL: declare void @llvm_align_decl(ptr align 4)
-llvm.func @llvm_align_decl(!llvm.ptr<f32> {llvm.align = 4})
+llvm.func @llvm_align_decl(!llvm.ptr {llvm.align = 4})
 
 // CHECK-LABEL: define void @inallocaattr(ptr inalloca(i32) %
-llvm.func @inallocaattr(%arg0: !llvm.ptr<i32> {llvm.inalloca = i32}) {
+llvm.func @inallocaattr(%arg0: !llvm.ptr {llvm.inalloca = i32}) {
   llvm.return
 }
 
 // CHECK-LABEL: declare void @inallocaattr_decl(ptr inalloca(i32))
-llvm.func @inallocaattr_decl(!llvm.ptr<i32> {llvm.inalloca = i32})
+llvm.func @inallocaattr_decl(!llvm.ptr {llvm.inalloca = i32})
 
 // CHECK-LABEL: define void @signextattr(i1 signext %
 llvm.func @signextattr(%arg0: i1 {llvm.signext}) {
@@ -1206,13 +1196,13 @@ llvm.func @alignstackattr_decl(!llvm.ptr {llvm.alignstack = 32 : i64})
 llvm.func @writeonlyattr_decl(!llvm.ptr {llvm.writeonly})
 
 // CHECK-LABEL: declare align 4 ptr @alignattr_ret_decl()
-llvm.func @alignattr_ret_decl() -> (!llvm.ptr<i32> {llvm.align = 4})
+llvm.func @alignattr_ret_decl() -> (!llvm.ptr {llvm.align = 4})
 
 // CHECK-LABEL: declare noalias ptr @noaliasattr_ret_decl()
-llvm.func @noaliasattr_ret_decl() -> (!llvm.ptr<i32> {llvm.noalias})
+llvm.func @noaliasattr_ret_decl() -> (!llvm.ptr {llvm.noalias})
 
 // CHECK-LABEL: declare noundef ptr @noundefattr_ret_decl()
-llvm.func @noundefattr_ret_decl() -> (!llvm.ptr<i32> {llvm.noundef})
+llvm.func @noundefattr_ret_decl() -> (!llvm.ptr {llvm.noundef})
 
 // CHECK-LABEL: declare signext i1 @signextattr_ret_decl()
 llvm.func @signextattr_ret_decl() -> (i1 {llvm.signext})
@@ -1252,8 +1242,8 @@ llvm.func @indirect_varargs_call(%arg0 : !llvm.ptr, %arg1 : i32) {
 llvm.func @intpointerconversion(%arg0 : i32) -> i32 {
 // CHECK:      %2 = inttoptr i32 %0 to ptr
 // CHECK-NEXT: %3 = ptrtoint ptr %2 to i32
-  %1 = llvm.inttoptr %arg0 : i32 to !llvm.ptr<i32>
-  %2 = llvm.ptrtoint %1 : !llvm.ptr<i32> to i32
+  %1 = llvm.inttoptr %arg0 : i32 to !llvm.ptr
+  %2 = llvm.ptrtoint %1 : !llvm.ptr to i32
   llvm.return %2 : i32
 }
 
@@ -1270,10 +1260,10 @@ llvm.func @fpconversion(%arg0 : i32) -> i32 {
 }
 
 // CHECK-LABEL: @addrspace
-llvm.func @addrspace(%arg0 : !llvm.ptr<i32>) -> !llvm.ptr<i32, 2> {
+llvm.func @addrspace(%arg0 : !llvm.ptr) -> !llvm.ptr<2> {
 // CHECK: %2 = addrspacecast ptr %0 to ptr addrspace(2)
-  %1 = llvm.addrspacecast %arg0 : !llvm.ptr<i32> to !llvm.ptr<i32, 2>
-  llvm.return %1 : !llvm.ptr<i32, 2>
+  %1 = llvm.addrspacecast %arg0 : !llvm.ptr to !llvm.ptr<2>
+  llvm.return %1 : !llvm.ptr<2>
 }
 
 llvm.func @stringconstant() -> !llvm.array<12 x i8> {
@@ -1403,11 +1393,11 @@ llvm.func @alloca(%size : i64) {
   // Alignment automatically set by the LLVM IR builder when alignment attribute
   // is 0.
   //  CHECK: alloca {{.*}} align 4
-  llvm.alloca %size x i32 {alignment = 0} : (i64) -> (!llvm.ptr<i32>)
+  llvm.alloca %size x i32 {alignment = 0} : (i64) -> (!llvm.ptr)
   // CHECK-NEXT: alloca {{.*}} align 8
-  llvm.alloca %size x i32 {alignment = 8} : (i64) -> (!llvm.ptr<i32>)
+  llvm.alloca %size x i32 {alignment = 8} : (i64) -> (!llvm.ptr)
   // CHECK-NEXT: alloca {{.*}} addrspace(3)
-  llvm.alloca %size x i32 {alignment = 0} : (i64) -> (!llvm.ptr<i32, 3>)
+  llvm.alloca %size x i32 {alignment = 0} : (i64) -> (!llvm.ptr<3>)
   // CHECK-NEXT: alloca inalloca {{.*}} align 4
   llvm.alloca inalloca %size x i32 : (i64) -> !llvm.ptr
   llvm.return
@@ -1444,10 +1434,10 @@ llvm.func @integer_extension_and_truncation(%a : i32) {
 
 // Check that the auxiliary `null` operation is converted into a `null` value.
 // CHECK-LABEL: @null
-llvm.func @null() -> !llvm.ptr<i32> {
-  %0 = llvm.mlir.zero : !llvm.ptr<i32>
+llvm.func @null() -> !llvm.ptr {
+  %0 = llvm.mlir.zero : !llvm.ptr
   // CHECK: ret ptr null
-  llvm.return %0 : !llvm.ptr<i32>
+  llvm.return %0 : !llvm.ptr
 }
 
 // Check that dense elements attributes are exported properly in constants.
@@ -1471,54 +1461,54 @@ llvm.func @elements_constant_3d_array() -> !llvm.array<2 x array<2 x array<2 x i
 
 // CHECK-LABEL: @atomicrmw
 llvm.func @atomicrmw(
-    %f32_ptr : !llvm.ptr<f32>, %f32 : f32,
-    %i32_ptr : !llvm.ptr<i32>, %i32 : i32) {
+    %f32_ptr : !llvm.ptr, %f32 : f32,
+    %i32_ptr : !llvm.ptr, %i32 : i32) {
   // CHECK: atomicrmw fadd ptr %{{.*}}, float %{{.*}} monotonic
-  %0 = llvm.atomicrmw fadd %f32_ptr, %f32 monotonic : !llvm.ptr<f32>, f32
+  %0 = llvm.atomicrmw fadd %f32_ptr, %f32 monotonic : !llvm.ptr, f32
   // CHECK: atomicrmw fsub ptr %{{.*}}, float %{{.*}} monotonic
-  %1 = llvm.atomicrmw fsub %f32_ptr, %f32 monotonic : !llvm.ptr<f32>, f32
+  %1 = llvm.atomicrmw fsub %f32_ptr, %f32 monotonic : !llvm.ptr, f32
   // CHECK: atomicrmw fmax ptr %{{.*}}, float %{{.*}} monotonic
-  %2 = llvm.atomicrmw fmax %f32_ptr, %f32 monotonic : !llvm.ptr<f32>, f32
+  %2 = llvm.atomicrmw fmax %f32_ptr, %f32 monotonic : !llvm.ptr, f32
   // CHECK: atomicrmw fmin ptr %{{.*}}, float %{{.*}} monotonic
-  %3 = llvm.atomicrmw fmin %f32_ptr, %f32 monotonic : !llvm.ptr<f32>, f32
+  %3 = llvm.atomicrmw fmin %f32_ptr, %f32 monotonic : !llvm.ptr, f32
   // CHECK: atomicrmw xchg ptr %{{.*}}, float %{{.*}} monotonic
-  %4 = llvm.atomicrmw xchg %f32_ptr, %f32 monotonic : !llvm.ptr<f32>, f32
+  %4 = llvm.atomicrmw xchg %f32_ptr, %f32 monotonic : !llvm.ptr, f32
   // CHECK: atomicrmw add ptr %{{.*}}, i32 %{{.*}} acquire
-  %5 = llvm.atomicrmw add %i32_ptr, %i32 acquire : !llvm.ptr<i32>, i32
+  %5 = llvm.atomicrmw add %i32_ptr, %i32 acquire : !llvm.ptr, i32
   // CHECK: atomicrmw sub ptr %{{.*}}, i32 %{{.*}} release
-  %6 = llvm.atomicrmw sub %i32_ptr, %i32 release : !llvm.ptr<i32>, i32
+  %6 = llvm.atomicrmw sub %i32_ptr, %i32 release : !llvm.ptr, i32
   // CHECK: atomicrmw and ptr %{{.*}}, i32 %{{.*}} acq_rel
-  %7 = llvm.atomicrmw _and %i32_ptr, %i32 acq_rel : !llvm.ptr<i32>, i32
+  %7 = llvm.atomicrmw _and %i32_ptr, %i32 acq_rel : !llvm.ptr, i32
   // CHECK: atomicrmw nand ptr %{{.*}}, i32 %{{.*}} seq_cst
-  %8 = llvm.atomicrmw nand %i32_ptr, %i32 seq_cst : !llvm.ptr<i32>, i32
+  %8 = llvm.atomicrmw nand %i32_ptr, %i32 seq_cst : !llvm.ptr, i32
   // CHECK: atomicrmw or ptr %{{.*}}, i32 %{{.*}} monotonic
-  %9 = llvm.atomicrmw _or %i32_ptr, %i32 monotonic : !llvm.ptr<i32>, i32
+  %9 = llvm.atomicrmw _or %i32_ptr, %i32 monotonic : !llvm.ptr, i32
   // CHECK: atomicrmw xor ptr %{{.*}}, i32 %{{.*}} monotonic
-  %10 = llvm.atomicrmw _xor %i32_ptr, %i32 monotonic : !llvm.ptr<i32>, i32
+  %10 = llvm.atomicrmw _xor %i32_ptr, %i32 monotonic : !llvm.ptr, i32
   // CHECK: atomicrmw max ptr %{{.*}}, i32 %{{.*}} monotonic
-  %11 = llvm.atomicrmw max %i32_ptr, %i32 monotonic : !llvm.ptr<i32>, i32
+  %11 = llvm.atomicrmw max %i32_ptr, %i32 monotonic : !llvm.ptr, i32
   // CHECK: atomicrmw min ptr %{{.*}}, i32 %{{.*}} monotonic
-  %12 = llvm.atomicrmw min %i32_ptr, %i32 monotonic : !llvm.ptr<i32>, i32
+  %12 = llvm.atomicrmw min %i32_ptr, %i32 monotonic : !llvm.ptr, i32
   // CHECK: atomicrmw umax ptr %{{.*}}, i32 %{{.*}} monotonic
-  %13 = llvm.atomicrmw umax %i32_ptr, %i32 monotonic : !llvm.ptr<i32>, i32
+  %13 = llvm.atomicrmw umax %i32_ptr, %i32 monotonic : !llvm.ptr, i32
   // CHECK: atomicrmw umin ptr %{{.*}}, i32 %{{.*}} monotonic
-  %14 = llvm.atomicrmw umin %i32_ptr, %i32 monotonic : !llvm.ptr<i32>, i32
+  %14 = llvm.atomicrmw umin %i32_ptr, %i32 monotonic : !llvm.ptr, i32
   // CHECK: atomicrmw uinc_wrap ptr %{{.*}}, i32 %{{.*}} monotonic
-  %15 = llvm.atomicrmw uinc_wrap %i32_ptr, %i32 monotonic : !llvm.ptr<i32>, i32
+  %15 = llvm.atomicrmw uinc_wrap %i32_ptr, %i32 monotonic : !llvm.ptr, i32
   // CHECK: atomicrmw udec_wrap ptr %{{.*}}, i32 %{{.*}} monotonic
-  %16 = llvm.atomicrmw udec_wrap %i32_ptr, %i32 monotonic : !llvm.ptr<i32>, i32
+  %16 = llvm.atomicrmw udec_wrap %i32_ptr, %i32 monotonic : !llvm.ptr, i32
 
   // CHECK: atomicrmw volatile
   // CHECK-SAME:  syncscope("singlethread")
   // CHECK-SAME:  align 8
-  %17 = llvm.atomicrmw volatile udec_wrap %i32_ptr, %i32 syncscope("singlethread") monotonic {alignment = 8 : i64} : !llvm.ptr<i32>, i32
+  %17 = llvm.atomicrmw volatile udec_wrap %i32_ptr, %i32 syncscope("singlethread") monotonic {alignment = 8 : i64} : !llvm.ptr, i32
   llvm.return
 }
 
 // CHECK-LABEL: @cmpxchg
-llvm.func @cmpxchg(%ptr : !llvm.ptr<i32>, %cmp : i32, %val: i32) {
+llvm.func @cmpxchg(%ptr : !llvm.ptr, %cmp : i32, %val: i32) {
   // CHECK: cmpxchg ptr %{{.*}}, i32 %{{.*}}, i32 %{{.*}} acq_rel monotonic
-  %0 = llvm.cmpxchg %ptr, %cmp, %val acq_rel monotonic : !llvm.ptr<i32>, i32
+  %0 = llvm.cmpxchg %ptr, %cmp, %val acq_rel monotonic : !llvm.ptr, i32
   // CHECK: %{{[0-9]+}} = extractvalue { i32, i1 } %{{[0-9]+}}, 0
   %1 = llvm.extractvalue %0[0] : !llvm.struct<(i32, i1)>
   // CHECK: %{{[0-9]+}} = extractvalue { i32, i1 } %{{[0-9]+}}, 1
@@ -1527,14 +1517,14 @@ llvm.func @cmpxchg(%ptr : !llvm.ptr<i32>, %cmp : i32, %val: i32) {
   // CHECK:  cmpxchg weak volatile
   // CHECK-SAME:  syncscope("singlethread")
   // CHECK-SAME:  align 8
-  %3 = llvm.cmpxchg weak volatile %ptr, %cmp, %val syncscope("singlethread") acq_rel monotonic {alignment = 8 : i64} : !llvm.ptr<i32>, i32
+  %3 = llvm.cmpxchg weak volatile %ptr, %cmp, %val syncscope("singlethread") acq_rel monotonic {alignment = 8 : i64} : !llvm.ptr, i32
   llvm.return
 }
 
-llvm.mlir.global external constant @_ZTIi() : !llvm.ptr<i8>
-llvm.func @foo(!llvm.ptr<i8>)
-llvm.func @vararg_foo(!llvm.ptr<i8>, ...)
-llvm.func @bar(!llvm.ptr<i8>) -> !llvm.ptr<i8>
+llvm.mlir.global external constant @_ZTIi() : !llvm.ptr
+llvm.func @foo(!llvm.ptr)
+llvm.func @vararg_foo(!llvm.ptr, ...)
+llvm.func @bar(!llvm.ptr) -> !llvm.ptr
 llvm.func @__gxx_personality_v0(...) -> i32
 
 // CHECK-LABEL: @invokeLandingpad
@@ -1542,14 +1532,13 @@ llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personali
 // CHECK: %[[a1:[0-9]+]] = alloca i8
   %0 = llvm.mlir.constant(0 : i32) : i32
   %1 = llvm.mlir.constant(dense<0> : vector<1xi8>) : !llvm.array<1 x i8>
-  %2 = llvm.mlir.addressof @_ZTIi : !llvm.ptr<ptr<i8>>
-  %3 = llvm.bitcast %2 : !llvm.ptr<ptr<i8>> to !llvm.ptr<i8>
-  %4 = llvm.mlir.zero : !llvm.ptr<ptr<i8>>
+  %2 = llvm.mlir.addressof @_ZTIi : !llvm.ptr
+  %4 = llvm.mlir.zero : !llvm.ptr
   %5 = llvm.mlir.constant(1 : i32) : i32
-  %6 = llvm.alloca %5 x i8 : (i32) -> !llvm.ptr<i8>
+  %6 = llvm.alloca %5 x i8 : (i32) -> !llvm.ptr
 // CHECK: invoke void @foo(ptr %[[a1]])
 // CHECK-NEXT: to label %[[normal:[0-9]+]] unwind label %[[unwind:[0-9]+]]
-  llvm.invoke @foo(%6) to ^bb2 unwind ^bb1 : (!llvm.ptr<i8>) -> ()
+  llvm.invoke @foo(%6) to ^bb2 unwind ^bb1 : (!llvm.ptr) -> ()
 
 // CHECK: [[unwind]]:
 ^bb1:
@@ -1557,7 +1546,7 @@ llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personali
 // CHECK-NEXT:             catch ptr null
 // CHECK-NEXT:             catch ptr @_ZTIi
 // CHECK-NEXT:             filter [1 x i8] zeroinitializer
-  %7 = llvm.landingpad (catch %4 : !llvm.ptr<ptr<i8>>) (catch %3 : !llvm.ptr<i8>) (filter %1 : !llvm.array<1 x i8>) : !llvm.struct<(ptr<i8>, i32)>
+  %7 = llvm.landingpad (catch %4 : !llvm.ptr) (catch %2 : !llvm.ptr) (filter %1 : !llvm.array<1 x i8>) : !llvm.struct<(ptr, i32)>
 // CHECK: br label %[[final:[0-9]+]]
   llvm.br ^bb3
 
@@ -1570,18 +1559,18 @@ llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personali
 // CHECK-NEXT: %{{[0-9]+}} = invoke ptr @bar(ptr %[[a1]])
 // CHECK-NEXT:          to label %[[normal]] unwind label %[[unwind]]
 ^bb3:	// pred: ^bb1
-  %8 = llvm.invoke @bar(%6) to ^bb2 unwind ^bb1 : (!llvm.ptr<i8>) -> !llvm.ptr<i8>
+  %8 = llvm.invoke @bar(%6) to ^bb2 unwind ^bb1 : (!llvm.ptr) -> !llvm.ptr
 
 // CHECK: [[BB4:.*]]:
 // CHECK: invoke void (ptr, ...) @vararg_foo(ptr %[[a1]], i32 0)
 ^bb4:
-  llvm.invoke @vararg_foo(%6, %0) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (ptr<i8>, ...)>) : (!llvm.ptr<i8>, i32) -> ()
+  llvm.invoke @vararg_foo(%6, %0) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (ptr, ...)>) : (!llvm.ptr, i32) -> ()
 
 // CHECK: [[BB5:.*]]:
 // CHECK: invoke void (ptr, ...) undef(ptr %[[a1]], i32 0)
 ^bb5:
   %9 = llvm.mlir.undef : !llvm.ptr
-  llvm.invoke %9(%6, %0) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (ptr<i8>, ...)>) : !llvm.ptr, (!llvm.ptr<i8>, i32) -> ()
+  llvm.invoke %9(%6, %0) to ^bb2 unwind ^bb1 vararg(!llvm.func<void (ptr, ...)>) : !llvm.ptr, (!llvm.ptr, i32) -> ()
 }
 
 // -----
@@ -1591,7 +1580,7 @@ llvm.func @__gxx_personality_v0(...) -> i32
 
 // CHECK-LABEL: @invoke_result
 // CHECK-SAME: %[[a0:[0-9]+]]
-llvm.func @invoke_result(%arg0 : !llvm.ptr<i8>) attributes { personality = @__gxx_personality_v0 } {
+llvm.func @invoke_result(%arg0 : !llvm.ptr) attributes { personality = @__gxx_personality_v0 } {
 // CHECK: %[[a1:[0-9]+]] = invoke i8 @foo()
 // CHECK-NEXT: to label %[[normal:[0-9]+]] unwind label %[[unwind:[0-9]+]]
     %0 = llvm.invoke @foo() to ^bb1 unwind ^bb2 : () -> i8
@@ -1600,7 +1589,7 @@ llvm.func @invoke_result(%arg0 : !llvm.ptr<i8>) attributes { personality = @__gx
 // CHECK-NEXT: store i8 %[[a1]], ptr %[[a0]]
 // CHECK-NEXT: ret void
 ^bb1:
-    llvm.store %0, %arg0 : !llvm.ptr<i8>
+    llvm.store %0, %arg0 : i8, !llvm.ptr
     llvm.return
 
 // CHECK: [[unwind]]:
@@ -1608,7 +1597,7 @@ llvm.func @invoke_result(%arg0 : !llvm.ptr<i8>) attributes { personality = @__gx
 // CHECK-NEXT: cleanup
 // CHECK-NEXT: ret void
 ^bb2:
-    %7 = llvm.landingpad cleanup : !llvm.struct<(ptr<i8>, i32)>
+    %7 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)>
     llvm.return
 }
 
@@ -1635,7 +1624,7 @@ llvm.func @invoke_phis() -> i32 attributes { personality = @__gxx_personality_v0
 // CHECK-NEXT: cleanup
 // CHECK-NEXT: br label %[[normal]]
 ^bb2:
-    %2 = llvm.landingpad cleanup : !llvm.struct<(ptr<i8>, i32)>
+    %2 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)>
     %3 = llvm.mlir.constant(1 : i32) : i32
     llvm.br ^bb1(%3 : i32)
 }
@@ -1782,17 +1771,17 @@ llvm.func @address_taken() {
   llvm.return
 }
 
-llvm.mlir.global internal constant @taker_of_address() : !llvm.ptr<func<void ()>> {
-  %0 = llvm.mlir.addressof @address_taken : !llvm.ptr<func<void ()>>
-  llvm.return %0 : !llvm.ptr<func<void ()>>
+llvm.mlir.global internal constant @taker_of_address() : !llvm.ptr {
+  %0 = llvm.mlir.addressof @address_taken : !llvm.ptr
+  llvm.return %0 : !llvm.ptr
 }
 
 // -----
 
 // CHECK: @forward_use_of_address = linkonce global ptr @address_declared_after_use
-llvm.mlir.global linkonce @forward_use_of_address() : !llvm.ptr<f32> {
-  %0 = llvm.mlir.addressof @address_declared_after_use : !llvm.ptr<f32>
-  llvm.return %0 : !llvm.ptr<f32>
+llvm.mlir.global linkonce @forward_use_of_address() : !llvm.ptr {
+  %0 = llvm.mlir.addressof @address_declared_after_use : !llvm.ptr
+  llvm.return %0 : !llvm.ptr
 }
 
 llvm.mlir.global linkonce @address_declared_after_use() : f32
@@ -1800,14 +1789,14 @@ llvm.mlir.global linkonce @address_declared_after_use() : f32
 // -----
 
 // CHECK: @take_self_address = linkonce global { i32, ptr } {{.*}} ptr @take_self_address
-llvm.mlir.global linkonce @take_self_address() : !llvm.struct<(i32, !llvm.ptr<i32>)> {
+llvm.mlir.global linkonce @take_self_address() : !llvm.struct<(i32, !llvm.ptr)> {
   %z32 = llvm.mlir.constant(0 : i32) : i32
-  %0 = llvm.mlir.undef : !llvm.struct<(i32, !llvm.ptr<i32>)>
-  %1 = llvm.mlir.addressof @take_self_address : !llvm.ptr<!llvm.struct<(i32, !llvm.ptr<i32>)>>
-  %2 = llvm.getelementptr %1[%z32, 0] : (!llvm.ptr<!llvm.struct<(i32, !llvm.ptr<i32>)>>, i32) -> !llvm.ptr<i32>
-  %3 = llvm.insertvalue %z32, %0[0] : !llvm.struct<(i32, !llvm.ptr<i32>)>
-  %4 = llvm.insertvalue %2, %3[1] : !llvm.struct<(i32, !llvm.ptr<i32>)>
-  llvm.return %4 : !llvm.struct<(i32, !llvm.ptr<i32>)>
+  %0 = llvm.mlir.undef : !llvm.struct<(i32, !llvm.ptr)>
+  %1 = llvm.mlir.addressof @take_self_address : !llvm.ptr
+  %2 = llvm.getelementptr %1[%z32, 0] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.struct<(i32, !llvm.ptr)>
+  %3 = llvm.insertvalue %z32, %0[0] : !llvm.struct<(i32, !llvm.ptr)>
+  %4 = llvm.insertvalue %2, %3[1] : !llvm.struct<(i32, !llvm.ptr)>
+  llvm.return %4 : !llvm.struct<(i32, !llvm.ptr)>
 }
 
 // -----
@@ -1879,7 +1868,7 @@ llvm.func @invoke_branch_weights() -> i32 attributes {personality = @__gxx_perso
   // CHECK: !prof ![[NODE:[0-9]+]]
   llvm.invoke @foo() to ^bb2 unwind ^bb1 {branch_weights = array<i32 : 42, 99>} : () -> ()
 ^bb1:  // pred: ^bb0
-  %1 = llvm.landingpad cleanup : !llvm.struct<(ptr<i8>, i32)>
+  %1 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)>
   llvm.br ^bb2
 ^bb2:  // 2 preds: ^bb0, ^bb1
   llvm.return %0 : i32
@@ -1892,11 +1881,11 @@ llvm.func @invoke_branch_weights() -> i32 attributes {personality = @__gxx_perso
 llvm.func @volatile_store_and_load() {
   %val = llvm.mlir.constant(5 : i32) : i32
   %size = llvm.mlir.constant(1 : i64) : i64
-  %0 = llvm.alloca %size x i32 : (i64) -> (!llvm.ptr<i32>)
+  %0 = llvm.alloca %size x i32 : (i64) -> (!llvm.ptr)
   // CHECK: store volatile i32 5, ptr %{{.*}}
-  llvm.store volatile %val, %0 : !llvm.ptr<i32>
+  llvm.store volatile %val, %0 : i32, !llvm.ptr
   // CHECK: %{{.*}} = load volatile i32, ptr %{{.*}}
-  %1 = llvm.load volatile %0: !llvm.ptr<i32>
+  %1 = llvm.load volatile %0: !llvm.ptr -> i32
   llvm.return
 }
 
@@ -1906,11 +1895,11 @@ llvm.func @volatile_store_and_load() {
 llvm.func @nontemporal_store_and_load() {
   %val = llvm.mlir.constant(5 : i32) : i32
   %size = llvm.mlir.constant(1 : i64) : i64
-  %0 = llvm.alloca %size x i32 : (i64) -> (!llvm.ptr<i32>)
+  %0 = llvm.alloca %size x i32 : (i64) -> (!llvm.ptr)
   // CHECK: !nontemporal ![[NODE:[0-9]+]]
-  llvm.store %val, %0 {nontemporal} : !llvm.ptr<i32>
+  llvm.store %val, %0 {nontemporal} : i32, !llvm.ptr
   // CHECK: !nontemporal ![[NODE]]
-  %1 = llvm.load %0 {nontemporal} : !llvm.ptr<i32>
+  %1 = llvm.load %0 {nontemporal} : !llvm.ptr -> i32
   llvm.return
 }
 
@@ -2246,19 +2235,17 @@ llvm.func @vararg_function(%arg0: i32, ...) {
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA0:.+]] = alloca %struct.va_list, align 8
-  %2 = llvm.alloca %1 x !llvm.struct<"struct.va_list", (ptr<i8>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<struct<"struct.va_list", (ptr<i8>)>>
-  %3 = llvm.bitcast %2 : !llvm.ptr<struct<"struct.va_list", (ptr<i8>)>> to !llvm.ptr<i8>
+  %2 = llvm.alloca %1 x !llvm.struct<"struct.va_list", (ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
   // CHECK: call void @llvm.va_start(ptr %[[ALLOCA0]])
-  llvm.intr.vastart %3 : !llvm.ptr<i8>
+  llvm.intr.vastart %2 : !llvm.ptr
   // CHECK: %[[ALLOCA1:.+]] = alloca ptr, align 8
-  %4 = llvm.alloca %0 x !llvm.ptr<i8> {alignment = 8 : i64} : (i32) -> !llvm.ptr<ptr<i8>>
-  %5 = llvm.bitcast %4 : !llvm.ptr<ptr<i8>> to !llvm.ptr<i8>
+  %4 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
   // CHECK: call void @llvm.va_copy(ptr %[[ALLOCA1]], ptr %[[ALLOCA0]])
-  llvm.intr.vacopy %3 to %5 : !llvm.ptr<i8>, !llvm.ptr<i8>
+  llvm.intr.vacopy %2 to %4 : !llvm.ptr, !llvm.ptr
   // CHECK: call void @llvm.va_end(ptr %[[ALLOCA1]])
   // CHECK: call void @llvm.va_end(ptr %[[ALLOCA0]])
-  llvm.intr.vaend %5 : !llvm.ptr<i8>
-  llvm.intr.vaend %3 : !llvm.ptr<i8>
+  llvm.intr.vaend %4 : !llvm.ptr
+  llvm.intr.vaend %2 : !llvm.ptr
   // CHECK: ret void
   llvm.return
 }
@@ -2266,7 +2253,7 @@ llvm.func @vararg_function(%arg0: i32, ...) {
 // -----
 
 // CHECK: declare void @readonly_function([[PTR:.+]] readonly)
-llvm.func @readonly_function(%arg0: !llvm.ptr<f32> {llvm.readonly})
+llvm.func @readonly_function(%arg0: !llvm.ptr {llvm.readonly})
 
 // -----
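The hunks above all apply the same mechanical rewrite: the element type is dropped from the pointer type and restated on each memory operation that needs it. As a minimal standalone sketch (not part of the patch; the function name is hypothetical), the post-migration forms look like:

  llvm.func @opaque_ptr_sketch(%p: !llvm.ptr, %idx: i64) {
    %c = llvm.mlir.constant(1.0 : f32) : f32
    // getelementptr now states its element type after the result type.
    %gep = llvm.getelementptr %p[%idx] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    // store is typed "value, pointer"; load is "pointer -> value".
    llvm.store %c, %gep : f32, !llvm.ptr
    %v = llvm.load %gep : !llvm.ptr -> f32
    llvm.return
  }

The address space is the only parameter the opaque type keeps, which is why !llvm.ptr<i32, 3> above becomes !llvm.ptr<3> while !llvm.ptr<f32> becomes plain !llvm.ptr.
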
 
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 24ef1198577937f..76540cc2c3973c3 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -268,11 +268,11 @@ llvm.func @nvvm_mma_m16n8k4_tf32_f32(%a0 : i32, %a1 : i32,
 // The test below checks the correct mapping of the nvvm.wmma.*.load.* op to the correct intrinsic
 // in the LLVM NVPTX backend.
 // CHECK-LABEL: @gpu_wmma_load_op
-llvm.func @gpu_wmma_load_op(%arg0: !llvm.ptr<i32, 3>, %arg1: i32) {
+llvm.func @gpu_wmma_load_op(%arg0: !llvm.ptr<3>, %arg1: i32) {
   // CHECK: call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p3(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
   %0 = nvvm.wmma.load %arg0, %arg1
     {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
-    : (!llvm.ptr<i32, 3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+    : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
 
   llvm.return
 }
@@ -280,13 +280,13 @@ llvm.func @gpu_wmma_load_op(%arg0: !llvm.ptr<i32, 3>, %arg1: i32) {
 // The test below checks the correct mapping of the nvvm.wmma.*.store.* op to the correct intrinsic
 // in the LLVM NVPTX backend.
 // CHECK-LABEL: @gpu_wmma_store_op
-llvm.func @gpu_wmma_store_op(%arg0: !llvm.ptr<i32, 3>, %arg1: i32,
+llvm.func @gpu_wmma_store_op(%arg0: !llvm.ptr<3>, %arg1: i32,
                             %arg2: vector<2 x f16>, %arg3: vector<2 x f16>,
                             %arg4: vector<2 xf16>, %arg5: vector<2 x f16>) {
   // CHECK: call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p3(ptr addrspace(3) %{{.*}}, <2 x half> {{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, <2 x half> %{{.*}}, i32 %{{.*}})
   nvvm.wmma.store %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
     {eltype = #nvvm.mma_type<f16>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
-    : !llvm.ptr<i32, 3>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>
+    : !llvm.ptr<3>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>, vector<2 x f16>
   llvm.return
 }
 
@@ -315,11 +315,11 @@ llvm.func @gpu_wmma_mma_op(%arg0: vector<2 x f16>, %arg1: vector<2 x f16>,
 }
 
 // CHECK-LABEL: @nvvm_wmma_load_tf32
-llvm.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr<i32>, %arg1 : i32) {
+llvm.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr, %arg1 : i32) {
   // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %{{.*}}, i32 %{{.*}})
   %0 = nvvm.wmma.load %arg0, %arg1
     {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
-    : (!llvm.ptr<i32>) -> !llvm.struct<(i32, i32, i32, i32)>
+    : (!llvm.ptr) -> !llvm.struct<(i32, i32, i32, i32)>
   llvm.return
 }
 
@@ -336,15 +336,15 @@ llvm.func @nvvm_wmma_mma(%0 : i32, %1 : i32, %2 : i32, %3 : i32, %4 : i32, %5 :
 }
 
 // CHECK-LABEL: @cp_async
-llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
+llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {
 // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
-  nvvm.cp.async.shared.global %arg0, %arg1, 4, cache =  ca : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
+  nvvm.cp.async.shared.global %arg0, %arg1, 4, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
 // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
-  nvvm.cp.async.shared.global %arg0, %arg1, 8, cache =  ca : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
+  nvvm.cp.async.shared.global %arg0, %arg1, 8, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
 // CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
-  nvvm.cp.async.shared.global %arg0, %arg1, 16, cache =  ca : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
+  nvvm.cp.async.shared.global %arg0, %arg1, 16, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
 // CHECK: call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %{{.*}}, ptr addrspace(1) %{{.*}})
-  nvvm.cp.async.shared.global %arg0, %arg1, 16, cache =  cg : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
+  nvvm.cp.async.shared.global %arg0, %arg1, 16, cache =  cg : !llvm.ptr<3>, !llvm.ptr<1>
 // CHECK: call void @llvm.nvvm.cp.async.commit.group()
   nvvm.cp.async.commit.group
 // CHECK: call void @llvm.nvvm.cp.async.wait.group(i32 0)
@@ -353,19 +353,19 @@ llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
 }
 
 // CHECK-LABEL: @ld_matrix
-llvm.func @ld_matrix(%arg0: !llvm.ptr<i32, 3>) {
+llvm.func @ld_matrix(%arg0: !llvm.ptr<3>) {
   // CHECK: call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) %{{.*}})
-  %l1 = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<i32, 3>) -> i32
+  %l1 = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<3>) -> i32
   // CHECK: call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16.p3(ptr addrspace(3) %{{.*}})
-  %l2 = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<i32, 3>) -> !llvm.struct<(i32, i32)>
+  %l2 = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
   // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16.p3(ptr addrspace(3) %{{.*}})
-  %l4 = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<i32, 3>) -> !llvm.struct<(i32, i32, i32, i32)>
+  %l4 = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
    // CHECK: call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16.p3(ptr addrspace(3) %{{.*}})
-  %l1t = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<i32, 3>) -> i32
+  %l1t = nvvm.ldmatrix %arg0 {num = 1 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<3>) -> i32
   // CHECK: call { i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16.p3(ptr addrspace(3) %{{.*}})
-  %l2t = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<i32, 3>) -> !llvm.struct<(i32, i32)>
+  %l2t = nvvm.ldmatrix %arg0 {num = 2 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
   // CHECK: call { i32, i32, i32, i32 } @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16.p3(ptr addrspace(3) %{{.*}})
-  %l4t = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<i32, 3>) -> !llvm.struct<(i32, i32, i32, i32)>
+  %l4t = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout<col>} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/openacc-llvm.mlir b/mlir/test/Target/LLVMIR/openacc-llvm.mlir
index 0954d929326c29b..897311c6e81beae 100644
--- a/mlir/test/Target/LLVMIR/openacc-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openacc-llvm.mlir
@@ -1,9 +1,9 @@
 // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
 
-llvm.func @testenterdataop(%arg0: !llvm.ptr<f32>, %arg1 : !llvm.ptr<f32>) {
-  %0 = acc.create varPtr(%arg0 : !llvm.ptr<f32>) -> !llvm.ptr<f32>
-  %1 = acc.copyin varPtr(%arg1 : !llvm.ptr<f32>) -> !llvm.ptr<f32>
-  acc.enter_data dataOperands(%0, %1 : !llvm.ptr<f32>, !llvm.ptr<f32>)
+llvm.func @testenterdataop(%arg0: !llvm.ptr, %arg1 : !llvm.ptr) {
+  %0 = acc.create varPtr(%arg0 : !llvm.ptr) -> !llvm.ptr
+  %1 = acc.copyin varPtr(%arg1 : !llvm.ptr) -> !llvm.ptr
+  acc.enter_data dataOperands(%0, %1 : !llvm.ptr, !llvm.ptr)
   llvm.return
 }
 
@@ -47,12 +47,12 @@ llvm.func @testenterdataop(%arg0: !llvm.ptr<f32>, %arg1 : !llvm.ptr<f32>) {
 // -----
 
 
-llvm.func @testexitdataop(%arg0: !llvm.ptr<f32>, %arg1: !llvm.ptr<f32>) {
-  %arg0_devptr = acc.getdeviceptr varPtr(%arg0 : !llvm.ptr<f32>) -> !llvm.ptr<f32>
-  %1 = acc.getdeviceptr varPtr(%arg1 : !llvm.ptr<f32>) -> !llvm.ptr<f32>
-  acc.exit_data dataOperands(%arg0_devptr, %1 : !llvm.ptr<f32>, !llvm.ptr<f32>)
-  acc.delete accPtr(%arg0_devptr : !llvm.ptr<f32>)
-  acc.copyout accPtr(%1 : !llvm.ptr<f32>) to varPtr(%arg1 : !llvm.ptr<f32>)
+llvm.func @testexitdataop(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  %arg0_devptr = acc.getdeviceptr varPtr(%arg0 : !llvm.ptr) -> !llvm.ptr
+  %1 = acc.getdeviceptr varPtr(%arg1 : !llvm.ptr) -> !llvm.ptr
+  acc.exit_data dataOperands(%arg0_devptr, %1 : !llvm.ptr, !llvm.ptr)
+  acc.delete accPtr(%arg0_devptr : !llvm.ptr)
+  acc.copyout accPtr(%1 : !llvm.ptr) to varPtr(%arg1 : !llvm.ptr)
   llvm.return
 }
 
@@ -94,9 +94,9 @@ llvm.func @testexitdataop(%arg0: !llvm.ptr<f32>, %arg1: !llvm.ptr<f32>) {
 
 // -----
 
-llvm.func @testupdateop(%arg1: !llvm.ptr<f32>) {
-  %0 = acc.update_device varPtr(%arg1 : !llvm.ptr<f32>) -> !llvm.ptr<f32>
-  acc.update dataOperands(%0 : !llvm.ptr<f32>)
+llvm.func @testupdateop(%arg1: !llvm.ptr) {
+  %0 = acc.update_device varPtr(%arg1 : !llvm.ptr) -> !llvm.ptr
+  acc.update dataOperands(%0 : !llvm.ptr)
   llvm.return
 }
 
@@ -130,17 +130,17 @@ llvm.func @testupdateop(%arg1: !llvm.ptr<f32>) {
 
 // -----
 
-llvm.func @testdataop(%arg0: !llvm.ptr<f32>, %arg1: !llvm.ptr<f32>, %arg2: !llvm.ptr<i32>) {
+llvm.func @testdataop(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr) {
   
-  %0 = acc.copyin varPtr(%arg0 : !llvm.ptr<f32>) -> !llvm.ptr<f32>
-  %1 = acc.create varPtr(%arg1 : !llvm.ptr<f32>) -> !llvm.ptr<f32>
-  acc.data dataOperands(%0, %1 : !llvm.ptr<f32>, !llvm.ptr<f32>) {
+  %0 = acc.copyin varPtr(%arg0 : !llvm.ptr) -> !llvm.ptr
+  %1 = acc.create varPtr(%arg1 : !llvm.ptr) -> !llvm.ptr
+  acc.data dataOperands(%0, %1 : !llvm.ptr, !llvm.ptr) {
     %9 = llvm.mlir.constant(2 : i32) : i32
-    llvm.store %9, %arg2 : !llvm.ptr<i32>
+    llvm.store %9, %arg2 : i32, !llvm.ptr
     acc.terminator
   }
-  acc.copyout accPtr(%0 : !llvm.ptr<f32>) to varPtr(%arg0 : !llvm.ptr<f32>)
-  acc.copyout accPtr(%1 : !llvm.ptr<f32>) to varPtr(%arg1 : !llvm.ptr<f32>)
+  acc.copyout accPtr(%0 : !llvm.ptr) to varPtr(%arg0 : !llvm.ptr)
+  acc.copyout accPtr(%1 : !llvm.ptr) to varPtr(%arg1 : !llvm.ptr)
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/openmp-nested.mlir b/mlir/test/Target/LLVMIR/openmp-nested.mlir
index d83a78cfbd589b7..e1fdfdd24a3cb06 100644
--- a/mlir/test/Target/LLVMIR/openmp-nested.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-nested.mlir
@@ -23,7 +23,7 @@ module {
         %20 = llvm.trunc %19 : i64 to i32
         %5 = llvm.mlir.addressof @str0 : !llvm.ptr
         %6 = llvm.getelementptr %5[%4, %4] : (!llvm.ptr, i32, i32) -> !llvm.ptr, !llvm.array<29 x i8>
-        %21 = llvm.call @printf(%6, %20, %20) vararg(!llvm.func<i32 (ptr<i8>, ...)>): (!llvm.ptr, i32, i32) -> i32
+        %21 = llvm.call @printf(%6, %20, %20) vararg(!llvm.func<i32 (ptr, ...)>): (!llvm.ptr, i32, i32) -> i32
         omp.yield
       }
       omp.terminator

>From 183f5094ff7da09beed46f760a857af449a24245 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <mamini at nvidia.com>
Date: Fri, 3 Nov 2023 13:26:56 -0700
Subject: [PATCH 76/76] use cached TM

Created using spr 1.3.4
---
 mlir/lib/Target/LLVM/ModuleToObject.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Target/LLVM/ModuleToObject.cpp b/mlir/lib/Target/LLVM/ModuleToObject.cpp
index 6af3d49ab23bf74..d94c10de8d7c424 100644
--- a/mlir/lib/Target/LLVM/ModuleToObject.cpp
+++ b/mlir/lib/Target/LLVM/ModuleToObject.cpp
@@ -45,8 +45,10 @@ Operation &ModuleToObject::getOperation() { return module; }
 
 std::optional<llvm::TargetMachine *>
 ModuleToObject::getOrCreateTargetMachine() {
-  std::string error;
+  if (targetMachine)
+    return targetMachine.get();
   // Load the target.
+  std::string error;
   const llvm::Target *target =
       llvm::TargetRegistry::lookupTarget(triple, error);
   if (!target) {
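
The hunk memoizes the lazily created target machine: after the first successful call, getOrCreateTargetMachine() returns the stored instance instead of repeating the registry lookup. A minimal sketch of the resulting control flow, assuming a std::unique_ptr<llvm::TargetMachine> targetMachine member (implied by targetMachine.get() above); the triple string appears in the hunk, while the chip and features strings and the abbreviated error handling are assumptions here:

  std::optional<llvm::TargetMachine *>
  ModuleToObject::getOrCreateTargetMachine() {
    // Fast path: reuse the machine created on a previous call.
    if (targetMachine)
      return targetMachine.get();
    std::string error;
    const llvm::Target *target =
        llvm::TargetRegistry::lookupTarget(triple, error);
    if (!target)
      return std::nullopt;
    // Slow path: build the machine once and keep it for later calls.
    targetMachine.reset(
        target->createTargetMachine(triple, chip, features, {}, {}));
    if (!targetMachine)
      return std::nullopt;
    return targetMachine.get();
  }

With this shape, repeated serializations of the same module no longer pay the registry lookup and machine construction cost on every call.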


