[llvm-branch-commits] [clang] [compiler-rt] [flang] [libc] [libcxx] [lld] [lldb] [llvm] [mlir] [openmp] [NFC][IR] Add SetNoSanitize helpers (PR #86772)

Vitaly Buka via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Mar 27 10:34:25 PDT 2024


https://github.com/vitalybuka updated https://github.com/llvm/llvm-project/pull/86772

>From aa2d5d54130bd9c5e9efb9ae3eaec631f227f13b Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Tue, 26 Mar 2024 23:09:09 -0700
Subject: [PATCH 01/54] Recommit "[RISCV][TTI] Scale the cost of the sext/zext
 with LMUL (#86617)"

Changes in Recommit:
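  Add an additional check for sign/zero extends to the same type: when the
  element-size ratio is not 2x, 4x or 8x, fall back to
  BaseT::getCastInstrCost instead of indexing the VSEXT/VZEXT opcode tables.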

Original message:
  Use the destination data type to measure the LMUL size for
  latency/throughput cost
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  20 +-
 llvm/test/Analysis/CostModel/RISCV/cast.ll    | 920 +++++++++---------
 .../CostModel/RISCV/reduce-scalable-int.ll    |  12 +-
 .../CostModel/RISCV/rvv-extractelement.ll     |  84 +-
 .../CostModel/RISCV/rvv-insertelement.ll      |  84 +-
 .../CostModel/RISCV/shuffle-broadcast.ll      |   2 +-
 6 files changed, 566 insertions(+), 556 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 000d01b8366cd1..38cdf3c47c6420 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -909,23 +909,33 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   if (!IsTypeLegal)
     return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
 
+  std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
+
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  // FIXME: Need to consider vsetvli and lmul.
   int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                 (int)Log2_32(Src->getScalarSizeInBits());
   switch (ISD) {
   case ISD::SIGN_EXTEND:
-  case ISD::ZERO_EXTEND:
-    if (Src->getScalarSizeInBits() == 1) {
+  case ISD::ZERO_EXTEND: {
+    const unsigned SrcEltSize = Src->getScalarSizeInBits();
+    if (SrcEltSize == 1) {
       // We do not use vsext/vzext to extend from mask vector.
       // Instead we use the following instructions to extend from mask vector:
       // vmv.v.i v8, 0
       // vmerge.vim v8, v8, -1, v0
-      return 2;
+      return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
+                                     DstLT.second, CostKind);
     }
-    return 1;
+    if ((PowDiff < 1) || (PowDiff > 3))
+      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
+    unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
+    unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
+    unsigned Op =
+        (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
+    return getRISCVInstructionCost(Op, DstLT.second, CostKind);
+  }
   case ISD::TRUNCATE:
     if (Dst->getScalarSizeInBits() == 1) {
       // We do not use several vncvt to truncate to mask vector. So we could
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
index bd26c19c2f2c3c..14da9a3f79d771 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -16,74 +16,74 @@ define void @sext() {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64>
@@ -96,73 +96,73 @@ define void @sext() {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -179,74 +179,74 @@ define void @sext() {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64>
@@ -259,73 +259,73 @@ define void @sext() {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
 ; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
 ; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128>
 ; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
 ; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -522,74 +522,74 @@ define void @zext() {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64>
@@ -602,73 +602,73 @@ define void @zext() {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
 ; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -685,74 +685,74 @@ define void @zext() {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 270 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64>
@@ -765,73 +765,73 @@ define void @zext() {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
 ; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
 ; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128>
 ; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
 ; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll
index 80efe912c86985..30cb32ce4eaf29 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-scalable-int.ll
@@ -1141,7 +1141,7 @@ define signext i32 @vreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
 
 define signext i32 @vwreduce_add_nxv4i16(<vscale x 4 x i16> %v) {
 ; CHECK-LABEL: 'vwreduce_add_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 4 x i16> %v to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = sext <vscale x 4 x i16> %v to <vscale x 4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %e)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %red
 ;
@@ -1157,7 +1157,7 @@ define signext i32 @vwreduce_add_nxv4i16(<vscale x 4 x i16> %v) {
 
 define signext i32 @vwreduce_uadd_nxv4i16(<vscale x 4 x i16> %v) {
 ; CHECK-LABEL: 'vwreduce_uadd_nxv4i16'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 4 x i16> %v to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = zext <vscale x 4 x i16> %v to <vscale x 4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %e)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %red
 ;
@@ -1445,7 +1445,7 @@ define i64 @vreduce_add_nxv2i64(<vscale x 2 x i64> %v) {
 
 define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
 ; CHECK-LABEL: 'vwreduce_add_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = sext <vscale x 2 x i32> %v to <vscale x 2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %red
 ;
@@ -1461,7 +1461,7 @@ define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
 
 define i64 @vwreduce_uadd_nxv2i32(<vscale x 2 x i32> %v) {
 ; CHECK-LABEL: 'vwreduce_uadd_nxv2i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %e = zext <vscale x 2 x i32> %v to <vscale x 2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %e)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %red
 ;
@@ -1597,7 +1597,7 @@ define i64 @vreduce_add_nxv4i64(<vscale x 4 x i64> %v) {
 
 define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
 ; CHECK-LABEL: 'vwreduce_add_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %e = sext <vscale x 4 x i32> %v to <vscale x 4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %red
 ;
@@ -1613,7 +1613,7 @@ define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
 
 define i64 @vwreduce_uadd_nxv4i32(<vscale x 4 x i32> %v) {
 ; CHECK-LABEL: 'vwreduce_uadd_nxv4i32'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %e = zext <vscale x 4 x i32> %v to <vscale x 4 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %red = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %e)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %red
 ;
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll
index 225bad6da5915c..aa7a90bece3390 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-extractelement.ll
@@ -12,12 +12,12 @@ define void @extractelement_int(i32 %x) {
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = extractelement <4 x i1> undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_0 = extractelement <8 x i1> undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_0 = extractelement <vscale x 2 x i1> undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_0 = extractelement <vscale x 4 x i1> undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_0 = extractelement <vscale x 8 x i1> undef, i32 0
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0
@@ -66,12 +66,12 @@ define void @extractelement_int(i32 %x) {
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_1 = extractelement <vscale x 2 x i1> undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_1 = extractelement <vscale x 4 x i1> undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_1 = extractelement <vscale x 8 x i1> undef, i32 1
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = extractelement <4 x i8> undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = extractelement <8 x i8> undef, i32 1
@@ -120,12 +120,12 @@ define void @extractelement_int(i32 %x) {
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_x = extractelement <vscale x 2 x i1> undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_x = extractelement <vscale x 4 x i1> undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_x = extractelement <vscale x 8 x i1> undef, i32 %x
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_x = extractelement <2 x i8> undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_x = extractelement <4 x i8> undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_x = extractelement <8 x i8> undef, i32 %x
@@ -177,12 +177,12 @@ define void @extractelement_int(i32 %x) {
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = extractelement <4 x i1> undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_0 = extractelement <8 x i1> undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_0 = extractelement <vscale x 2 x i1> undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_0 = extractelement <vscale x 4 x i1> undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_0 = extractelement <vscale x 8 x i1> undef, i32 0
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0
@@ -231,12 +231,12 @@ define void @extractelement_int(i32 %x) {
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_1 = extractelement <vscale x 2 x i1> undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_1 = extractelement <vscale x 4 x i1> undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_1 = extractelement <vscale x 8 x i1> undef, i32 1
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = extractelement <4 x i8> undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = extractelement <8 x i8> undef, i32 1
@@ -285,12 +285,12 @@ define void @extractelement_int(i32 %x) {
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_x = extractelement <vscale x 2 x i1> undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_x = extractelement <vscale x 4 x i1> undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_x = extractelement <vscale x 8 x i1> undef, i32 %x
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_x = extractelement <2 x i8> undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_x = extractelement <4 x i8> undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_x = extractelement <8 x i8> undef, i32 %x
@@ -341,13 +341,13 @@ define void @extractelement_int(i32 %x) {
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_0 = extractelement <2 x i1> undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = extractelement <4 x i1> undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_0 = extractelement <8 x i1> undef, i32 0
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_0 = extractelement <vscale x 2 x i1> undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_0 = extractelement <vscale x 4 x i1> undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_0 = extractelement <vscale x 8 x i1> undef, i32 0
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0
@@ -395,13 +395,13 @@ define void @extractelement_int(i32 %x) {
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i1_1 = extractelement <2 x i1> undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_1 = extractelement <vscale x 2 x i1> undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_1 = extractelement <vscale x 4 x i1> undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_1 = extractelement <vscale x 8 x i1> undef, i32 1
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = extractelement <4 x i8> undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = extractelement <8 x i8> undef, i32 1
@@ -449,13 +449,13 @@ define void @extractelement_int(i32 %x) {
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_x = extractelement <vscale x 2 x i1> undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_x = extractelement <vscale x 4 x i1> undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_x = extractelement <vscale x 8 x i1> undef, i32 %x
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_x = extractelement <2 x i8> undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_x = extractelement <4 x i8> undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_x = extractelement <8 x i8> undef, i32 %x
@@ -506,13 +506,13 @@ define void @extractelement_int(i32 %x) {
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_0 = extractelement <2 x i1> undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = extractelement <4 x i1> undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_0 = extractelement <8 x i1> undef, i32 0
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = extractelement <16 x i1> undef, i32 0
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i1_0 = extractelement <32 x i1> undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_0 = extractelement <vscale x 2 x i1> undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_0 = extractelement <vscale x 4 x i1> undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_0 = extractelement <vscale x 8 x i1> undef, i32 0
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = extractelement <vscale x 16 x i1> undef, i32 0
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i1_0 = extractelement <vscale x 32 x i1> undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = extractelement <2 x i8> undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = extractelement <4 x i8> undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = extractelement <8 x i8> undef, i32 0
@@ -560,13 +560,13 @@ define void @extractelement_int(i32 %x) {
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i1_1 = extractelement <2 x i1> undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_1 = extractelement <4 x i1> undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_1 = extractelement <8 x i1> undef, i32 1
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = extractelement <16 x i1> undef, i32 1
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v32i1_1 = extractelement <32 x i1> undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_1 = extractelement <vscale x 2 x i1> undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_1 = extractelement <vscale x 4 x i1> undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_1 = extractelement <vscale x 8 x i1> undef, i32 1
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = extractelement <vscale x 16 x i1> undef, i32 1
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_1 = extractelement <vscale x 32 x i1> undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = extractelement <2 x i8> undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = extractelement <4 x i8> undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = extractelement <8 x i8> undef, i32 1
@@ -614,13 +614,13 @@ define void @extractelement_int(i32 %x) {
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i1_x = extractelement <2 x i1> undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_x = extractelement <4 x i1> undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_x = extractelement <8 x i1> undef, i32 %x
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1_x = extractelement <16 x i1> undef, i32 %x
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v32i1_x = extractelement <32 x i1> undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_x = extractelement <vscale x 2 x i1> undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_x = extractelement <vscale x 4 x i1> undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_x = extractelement <vscale x 8 x i1> undef, i32 %x
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_x = extractelement <vscale x 16 x i1> undef, i32 %x
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv32i1_x = extractelement <vscale x 32 x i1> undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_x = extractelement <2 x i8> undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_x = extractelement <4 x i8> undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_x = extractelement <8 x i8> undef, i32 %x
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll
index 5387c8dc35940e..6e1ae0216f7655 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-insertelement.ll
@@ -12,12 +12,12 @@ define void @insertelement_int(i32 %x) {
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i1_0 = insertelement <4 x i1> undef, i1 undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i1_0 = insertelement <8 x i1> undef, i1 undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv2i1_0 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv4i1_0 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8i1_0 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 0
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
@@ -66,12 +66,12 @@ define void @insertelement_int(i32 %x) {
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv2i1_1 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i1_1 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i1_1 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 1
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = insertelement <2 x i8> undef, i8 undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = insertelement <4 x i8> undef, i8 undef, i32 1
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = insertelement <8 x i8> undef, i8 undef, i32 1
@@ -120,12 +120,12 @@ define void @insertelement_int(i32 %x) {
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv2i1_x = insertelement <vscale x 2 x i1> undef, i1 undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv4i1_x = insertelement <vscale x 4 x i1> undef, i1 undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i1_x = insertelement <vscale x 8 x i1> undef, i1 undef, i32 %x
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x
-; RV32V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x
+; RV32V-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_x = insertelement <2 x i8> undef, i8 undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_x = insertelement <4 x i8> undef, i8 undef, i32 %x
 ; RV32V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_x = insertelement <8 x i8> undef, i8 undef, i32 %x
@@ -177,12 +177,12 @@ define void @insertelement_int(i32 %x) {
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i1_0 = insertelement <4 x i1> undef, i1 undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i1_0 = insertelement <8 x i1> undef, i1 undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv2i1_0 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv4i1_0 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8i1_0 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 0
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
@@ -231,12 +231,12 @@ define void @insertelement_int(i32 %x) {
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv2i1_1 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i1_1 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i1_1 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 1
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = insertelement <2 x i8> undef, i8 undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = insertelement <4 x i8> undef, i8 undef, i32 1
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = insertelement <8 x i8> undef, i8 undef, i32 1
@@ -285,12 +285,12 @@ define void @insertelement_int(i32 %x) {
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv2i1_x = insertelement <vscale x 2 x i1> undef, i1 undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv4i1_x = insertelement <vscale x 4 x i1> undef, i1 undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i1_x = insertelement <vscale x 8 x i1> undef, i1 undef, i32 %x
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x
-; RV64V-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x
+; RV64V-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_x = insertelement <2 x i8> undef, i8 undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_x = insertelement <4 x i8> undef, i8 undef, i32 %x
 ; RV64V-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_x = insertelement <8 x i8> undef, i8 undef, i32 %x
@@ -341,13 +341,13 @@ define void @insertelement_int(i32 %x) {
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2i1_0 = insertelement <2 x i1> undef, i1 undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i1_0 = insertelement <4 x i1> undef, i1 undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i1_0 = insertelement <8 x i1> undef, i1 undef, i32 0
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv2i1_0 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv4i1_0 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8i1_0 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 0
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
@@ -395,13 +395,13 @@ define void @insertelement_int(i32 %x) {
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2i1_1 = insertelement <2 x i1> undef, i1 undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv2i1_1 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i1_1 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i1_1 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 1
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = insertelement <2 x i8> undef, i8 undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = insertelement <4 x i8> undef, i8 undef, i32 1
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = insertelement <8 x i8> undef, i8 undef, i32 1
@@ -449,13 +449,13 @@ define void @insertelement_int(i32 %x) {
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv2i1_x = insertelement <vscale x 2 x i1> undef, i1 undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv4i1_x = insertelement <vscale x 4 x i1> undef, i1 undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i1_x = insertelement <vscale x 8 x i1> undef, i1 undef, i32 %x
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x
-; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x
+; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_x = insertelement <2 x i8> undef, i8 undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_x = insertelement <4 x i8> undef, i8 undef, i32 %x
 ; RV32ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_x = insertelement <8 x i8> undef, i8 undef, i32 %x
@@ -506,13 +506,13 @@ define void @insertelement_int(i32 %x) {
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2i1_0 = insertelement <2 x i1> undef, i1 undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i1_0 = insertelement <4 x i1> undef, i1 undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i1_0 = insertelement <8 x i1> undef, i1 undef, i32 0
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1_0 = insertelement <16 x i1> undef, i1 undef, i32 0
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v32i1_0 = insertelement <32 x i1> undef, i1 undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv2i1_0 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv4i1_0 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv8i1_0 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 0
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_0 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 0
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv32i1_0 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
@@ -560,13 +560,13 @@ define void @insertelement_int(i32 %x) {
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2i1_1 = insertelement <2 x i1> undef, i1 undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i1_1 = insertelement <4 x i1> undef, i1 undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i1_1 = insertelement <8 x i1> undef, i1 undef, i32 1
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_1 = insertelement <16 x i1> undef, i1 undef, i32 1
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i1_1 = insertelement <32 x i1> undef, i1 undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv2i1_1 = insertelement <vscale x 2 x i1> undef, i1 undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i1_1 = insertelement <vscale x 4 x i1> undef, i1 undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i1_1 = insertelement <vscale x 8 x i1> undef, i1 undef, i32 1
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_1 = insertelement <vscale x 16 x i1> undef, i1 undef, i32 1
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv32i1_1 = insertelement <vscale x 32 x i1> undef, i1 undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_1 = insertelement <2 x i8> undef, i8 undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_1 = insertelement <4 x i8> undef, i8 undef, i32 1
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_1 = insertelement <8 x i8> undef, i8 undef, i32 1
@@ -614,13 +614,13 @@ define void @insertelement_int(i32 %x) {
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i1_x = insertelement <2 x i1> undef, i1 undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1_x = insertelement <4 x i1> undef, i1 undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i1_x = insertelement <8 x i1> undef, i1 undef, i32 %x
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i1_x = insertelement <16 x i1> undef, i1 undef, i32 %x
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32i1_x = insertelement <32 x i1> undef, i1 undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv2i1_x = insertelement <vscale x 2 x i1> undef, i1 undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv4i1_x = insertelement <vscale x 4 x i1> undef, i1 undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i1_x = insertelement <vscale x 8 x i1> undef, i1 undef, i32 %x
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x
-; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i1_x = insertelement <vscale x 16 x i1> undef, i1 undef, i32 %x
+; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i1_x = insertelement <vscale x 32 x i1> undef, i1 undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_x = insertelement <2 x i8> undef, i8 undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_x = insertelement <4 x i8> undef, i8 undef, i32 %x
 ; RV64ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_x = insertelement <8 x i8> undef, i8 undef, i32 %x
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll
index 46bf3152ac5bd3..b763198e98bacd 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll
@@ -197,7 +197,7 @@ define void  @broadcast_fixed() #0{
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ins2 = insertelement <2 x i8> poison, i8 3, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer

>From 6d13263d4a723689d025423562269ea6ccb6bfc2 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 27 Mar 2024 15:23:34 +0800
Subject: [PATCH 02/54] [RISCV] Add tests for combineBinOpOfZExts. NFC (#86689)

Unlike add, sub and mul, we don't have widening instructions for div,
rem and logical ops, so we don't have any test coverage if we were to
extend combineBinOpOfZExts to handle them.

Adding tests coincidentally revealed that logical ops are already
narrowed as a generic DAG combine via
DAGCombiner::hoistLogicOpWithSameOpcodeHands. So we don't actually need
to run combineBinOpOfZExts on them.
---
 llvm/test/CodeGen/RISCV/rvv/binop-zext.ll | 146 ++++++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/binop-zext.ll

diff --git a/llvm/test/CodeGen/RISCV/rvv/binop-zext.ll b/llvm/test/CodeGen/RISCV/rvv/binop-zext.ll
new file mode 100644
index 00000000000000..e050240f0de114
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/binop-zext.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+; Check that we perform binary arithmetic in a narrower type where possible, via
+; combineBinOpOfZExt or otherwise.
+
+define <vscale x 8 x i32> @add(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: add:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vwaddu.vv v12, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %add = add <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %add
+}
+
+define <vscale x 8 x i32> @sub(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: sub:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vwsubu.vv v12, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsext.vf2 v8, v12
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %sub = sub <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %sub
+}
+
+define <vscale x 8 x i32> @mul(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: mul:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vwmulu.vv v12, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf2 v8, v12
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %mul = mul <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %mul
+}
+
+define <vscale x 8 x i32> @sdiv(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: sdiv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf4 v12, v8
+; CHECK-NEXT:    vzext.vf4 v16, v9
+; CHECK-NEXT:    vdivu.vv v8, v12, v16
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %sdiv = sdiv <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %sdiv
+}
+
+define <vscale x 8 x i32> @udiv(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: udiv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf4 v12, v8
+; CHECK-NEXT:    vzext.vf4 v16, v9
+; CHECK-NEXT:    vdivu.vv v8, v12, v16
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %udiv = udiv <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %udiv
+}
+
+define <vscale x 8 x i32> @srem(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: srem:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf4 v12, v8
+; CHECK-NEXT:    vzext.vf4 v16, v9
+; CHECK-NEXT:    vremu.vv v8, v12, v16
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %srem = srem <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %srem
+}
+
+define <vscale x 8 x i32> @urem(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: urem:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf4 v12, v8
+; CHECK-NEXT:    vzext.vf4 v16, v9
+; CHECK-NEXT:    vremu.vv v8, v12, v16
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %urem = urem <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %urem
+}
+
+define <vscale x 8 x i32> @and(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: and:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vv v12, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %shl = and <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %shl
+}
+
+define <vscale x 8 x i32> @or(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: or:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vor.vv v12, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %or = or <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %or
+}
+
+define <vscale x 8 x i32> @xor(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: xor:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vxor.vv v12, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vzext.vf4 v8, v12
+; CHECK-NEXT:    ret
+  %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+  %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+  %xor = xor <vscale x 8 x i32> %a.zext, %b.zext
+  ret <vscale x 8 x i32> %xor
+}

>From defc4859b032ccaec69f24b6cfd9882fece5f093 Mon Sep 17 00:00:00 2001
From: Jack Styles <99514724+Stylie777 at users.noreply.github.com>
Date: Wed, 27 Mar 2024 07:49:38 +0000
Subject: [PATCH 03/54] [AArch64] Remove Automatic Enablement of FEAT_F32MM
 (#85203)

When `+sve` is passed on the command line and the architecture being
targeted is v8.6-A/v9.1-A or later, `+f32mm` is also added. This enables
FEAT_F32MM; however, at the time of writing no CPUs support it. This
leads to FEAT_F32MM instructions being compiled for CPUs that do not
support them.

This commit removes the automatic enablement, however the option is
still able to be used by passing `+f32mm`.
---
 clang/test/Driver/aarch64-sve.c                   |  9 ++++-----
 clang/test/Preprocessor/aarch64-target-features.c |  2 +-
 llvm/docs/ReleaseNotes.rst                        |  1 +
 llvm/lib/TargetParser/AArch64TargetParser.cpp     |  5 -----
 llvm/unittests/TargetParser/TargetParserTest.cpp  | 15 ++++-----------
 5 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/clang/test/Driver/aarch64-sve.c b/clang/test/Driver/aarch64-sve.c
index f34b2700deb91c..4a33c2e3c8d367 100644
--- a/clang/test/Driver/aarch64-sve.c
+++ b/clang/test/Driver/aarch64-sve.c
@@ -6,12 +6,11 @@
 // RUN: %clang --target=aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV8A-NOSVE %s
 // GENERICV8A-NOSVE-NOT: "-target-feature" "+sve"
 
-// The 32-bit floating point matrix multiply extension is enabled by default
-// for armv8.6-a targets (or later) with SVE, and can optionally be enabled for
-// any target from armv8.2a onwards (we don't enforce not using it with earlier
-// targets).
+// The 32-bit floating point matrix multiply extension is an optional feature
+// that can be used for any target from armv8.2a and onwards. This can be
+// enabled using the `+f32mm` option.
 // RUN: %clang --target=aarch64 -march=armv8.6a       -### -c %s 2>&1 | FileCheck -check-prefix=NO-F32MM %s
-// RUN: %clang --target=aarch64 -march=armv8.6a+sve   -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s
+// RUN: %clang --target=aarch64 -march=armv8.6a+sve+f32mm   -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s
 // RUN: %clang --target=aarch64 -march=armv8.5a+f32mm -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s
 // NO-F32MM-NOT: "-target-feature" "+f32mm"
 // F32MM: "-target-feature" "+f32mm"
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 9f8a8bdeeb9cb0..85762b7fed4d71 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -196,7 +196,7 @@
 // CHECK-8_6-NOT: __ARM_FEATURE_SHA3 1
 // CHECK-8_6-NOT: __ARM_FEATURE_SM4 1
 
-// RUN: %clang -target aarch64-none-linux-gnu -march=armv8.6-a+sve -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE-8_6 %s
+// RUN: %clang -target aarch64-none-linux-gnu -march=armv8.6-a+sve+f32mm -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE-8_6 %s
 // CHECK-SVE-8_6: __ARM_FEATURE_SVE 1
 // CHECK-SVE-8_6: __ARM_FEATURE_SVE_BF16 1
 // CHECK-SVE-8_6: __ARM_FEATURE_SVE_MATMUL_FP32 1
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index c2b1a9d3d73835..7588048334d792 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -76,6 +76,7 @@ Changes to the AMDGPU Backend
 
 Changes to the ARM Backend
 --------------------------
+* FEAT_F32MM is no longer activated by default when using `+sve` on v8.6-A or greater. The feature is still available and can be used by adding `+f32mm` to the command line options.
 
 Changes to the AVR Backend
 --------------------------
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index e36832f563eed8..71099462d5ecff 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -186,11 +186,6 @@ void AArch64::ExtensionSet::enable(ArchExtKind E) {
   // Special cases for dependencies which vary depending on the base
   // architecture version.
   if (BaseArch) {
-    // +sve implies +f32mm if the base architecture is v8.6A+ or v9.1A+
-    // It isn't the case in general that sve implies both f64mm and f32mm
-    if (E == AEK_SVE && BaseArch->is_superset(ARMV8_6A))
-      enable(AEK_F32MM);
-
     // +fp16 implies +fp16fml for v8.4A+, but not v9.0-A+
     if (E == AEK_FP16 && BaseArch->is_superset(ARMV8_4A) &&
         !BaseArch->is_superset(ARMV9A))
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index a7d0b1687a7f91..2c72a7229b5274 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -2347,13 +2347,6 @@ AArch64ExtensionDependenciesBaseArchTestParams
          {},
          {"aes", "sha2", "sha3", "sm4"}},
 
-        // +sve implies +f32mm if the base architecture is v8.6A+ or v9.1A+, but
-        // not earlier architectures.
-        {AArch64::ARMV8_5A, {"sve"}, {"sve"}, {"f32mm"}},
-        {AArch64::ARMV9A, {"sve"}, {"sve"}, {"f32mm"}},
-        {AArch64::ARMV8_6A, {"sve"}, {"sve", "f32mm"}, {}},
-        {AArch64::ARMV9_1A, {"sve"}, {"sve", "f32mm"}, {}},
-
         // +fp16 implies +fp16fml for v8.4A+, but not v9.0-A+
         {AArch64::ARMV8_3A, {"fp16"}, {"fullfp16"}, {"fp16fml"}},
         {AArch64::ARMV9A, {"fp16"}, {"fullfp16"}, {"fp16fml"}},
@@ -2520,10 +2513,10 @@ AArch64ExtensionDependenciesBaseCPUTestParams
          {}},
         {"cortex-a520",
          {},
-         {"v9.2a",    "bf16",     "crc",     "dotprod", "f32mm",        "flagm",
-          "fp-armv8", "fullfp16", "fp16fml", "i8mm",    "lse",          "mte",
-          "pauth",    "perfmon",  "predres", "ras",     "rcpc",         "rdm",
-          "sb",       "neon",     "ssbs",    "sve",     "sve2-bitperm", "sve2"},
+         {"v9.2a",    "bf16",    "crc",  "dotprod",      "flagm", "fp-armv8",
+          "fullfp16", "fp16fml", "i8mm", "lse",          "mte",   "pauth",
+          "perfmon",  "predres", "ras",  "rcpc",         "rdm",   "sb",
+          "neon",     "ssbs",    "sve",  "sve2-bitperm", "sve2"},
          {}},
 
         // Negative modifiers

>From 2938f1cff9f880d03c900a2bdcd078af937d9433 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Sun, 24 Mar 2024 05:56:52 -0400
Subject: [PATCH 04/54] [InstCombine] Refactor powi(X,Y) / X to call
 foldPowiReassoc, NFC

---
 .../InstCombine/InstCombineMulDivRem.cpp      | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index af238a43b11a05..a9f0a16be0b8c9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -611,6 +611,18 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) {
       Y->getType() == Z->getType())
     return createPowiExpr(I, *this, X, Y, Z);
 
+  // powi(X, Y) / X --> powi(X, Y-1)
+  // This is legal when (Y - 1) can't wraparound, in which case reassoc and nnan
+  // are required.
+  // TODO: Multi-use may be also better off creating Powi(x,y-1)
+  if (I.hasAllowReassoc() && I.hasNoNaNs() &&
+      match(Op0, m_OneUse(m_Intrinsic<Intrinsic::powi>(m_Specific(Op1),
+                                                       m_Value(Y)))) &&
+      willNotOverflowSignedSub(Y, ConstantInt::get(Y->getType(), 1), I)) {
+    Constant *NegOne = ConstantInt::getAllOnesValue(Y->getType());
+    return createPowiExpr(I, *this, Op1, Y, NegOne);
+  }
+
   return nullptr;
 }
 
@@ -1904,20 +1916,8 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
     return replaceInstUsesWith(I, Pow);
   }
 
-  // powi(X, Y) / X --> powi(X, Y-1)
-  // This is legal when (Y - 1) can't wraparound, in which case reassoc and nnan
-  // are required.
-  // TODO: Multi-use may be also better off creating Powi(x,y-1)
-  if (I.hasAllowReassoc() && I.hasNoNaNs() &&
-      match(Op0, m_OneUse(m_Intrinsic<Intrinsic::powi>(m_Specific(Op1),
-                                                       m_Value(Y)))) &&
-      willNotOverflowSignedSub(Y, ConstantInt::get(Y->getType(), 1), I)) {
-    Constant *NegOne = ConstantInt::getAllOnesValue(Y->getType());
-    Value *Y1 = Builder.CreateAdd(Y, NegOne);
-    Type *Types[] = {Op1->getType(), Y1->getType()};
-    Value *Pow = Builder.CreateIntrinsic(Intrinsic::powi, Types, {Op1, Y1}, &I);
-    return replaceInstUsesWith(I, Pow);
-  }
+  if (Instruction *FoldedPowi = foldPowiReassoc(I))
+    return FoldedPowi;
 
   return nullptr;
 }

>From bd9bb31bce0754c0a04d5c842ab3e7f8dd467861 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Sun, 24 Mar 2024 06:16:41 -0400
Subject: [PATCH 05/54] [InstCombine] add restrict reassoc for the powi(X,Y) /
 X

Restrict the powi(X,Y) / X fold to cases where the powi call itself also has the reassoc flag, following the discussion on PR69998.
---
 .../InstCombine/InstCombineMulDivRem.cpp        |  4 ++--
 llvm/test/Transforms/InstCombine/powi.ll        | 17 ++++++++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index a9f0a16be0b8c9..8c698e52b5a0e6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -616,8 +616,8 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) {
   // are required.
   // TODO: Multi-use may be also better off creating Powi(x,y-1)
   if (I.hasAllowReassoc() && I.hasNoNaNs() &&
-      match(Op0, m_OneUse(m_Intrinsic<Intrinsic::powi>(m_Specific(Op1),
-                                                       m_Value(Y)))) &&
+      match(Op0, m_OneUse(m_AllowReassoc(m_Intrinsic<Intrinsic::powi>(
+                     m_Specific(Op1), m_Value(Y))))) &&
       willNotOverflowSignedSub(Y, ConstantInt::get(Y->getType(), 1), I)) {
     Constant *NegOne = ConstantInt::getAllOnesValue(Y->getType());
     return createPowiExpr(I, *this, Op1, Y, NegOne);
diff --git a/llvm/test/Transforms/InstCombine/powi.ll b/llvm/test/Transforms/InstCombine/powi.ll
index 43e34c889106e1..6c0575e8b71971 100644
--- a/llvm/test/Transforms/InstCombine/powi.ll
+++ b/llvm/test/Transforms/InstCombine/powi.ll
@@ -313,7 +313,7 @@ define double @fdiv_pow_powi(double %x) {
 ; CHECK-NEXT:    [[DIV:%.*]] = fmul reassoc nnan double [[X:%.*]], [[X]]
 ; CHECK-NEXT:    ret double [[DIV]]
 ;
-  %p1 = call double @llvm.powi.f64.i32(double %x, i32 3)
+  %p1 = call reassoc double @llvm.powi.f64.i32(double %x, i32 3)
   %div = fdiv reassoc nnan double %p1, %x
   ret double %div
 }
@@ -323,7 +323,7 @@ define float @fdiv_powf_powi(float %x) {
 ; CHECK-NEXT:    [[DIV:%.*]] = call reassoc nnan float @llvm.powi.f32.i32(float [[X:%.*]], i32 99)
 ; CHECK-NEXT:    ret float [[DIV]]
 ;
-  %p1 = call float @llvm.powi.f32.i32(float %x, i32 100)
+  %p1 = call reassoc float @llvm.powi.f32.i32(float %x, i32 100)
   %div = fdiv reassoc nnan float %p1, %x
   ret float %div
 }
@@ -347,10 +347,21 @@ define double @fdiv_pow_powi_multi_use(double %x) {
 define float @fdiv_powf_powi_missing_reassoc(float %x) {
 ; CHECK-LABEL: @fdiv_powf_powi_missing_reassoc(
 ; CHECK-NEXT:    [[P1:%.*]] = call float @llvm.powi.f32.i32(float [[X:%.*]], i32 100)
-; CHECK-NEXT:    [[DIV:%.*]] = fdiv nnan float [[P1]], [[X]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc nnan float [[P1]], [[X]]
 ; CHECK-NEXT:    ret float [[DIV]]
 ;
   %p1 = call float @llvm.powi.f32.i32(float %x, i32 100)
+  %div = fdiv reassoc nnan float %p1, %x
+  ret float %div
+}
+
+define float @fdiv_powf_powi_missing_reassoc1(float %x) {
+; CHECK-LABEL: @fdiv_powf_powi_missing_reassoc1(
+; CHECK-NEXT:    [[P1:%.*]] = call reassoc float @llvm.powi.f32.i32(float [[X:%.*]], i32 100)
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv nnan float [[P1]], [[X]]
+; CHECK-NEXT:    ret float [[DIV]]
+;
+  %p1 = call reassoc float @llvm.powi.f32.i32(float %x, i32 100)
   %div = fdiv nnan float %p1, %x
   ret float %div
 }

>From df75183d70e029352a49c93f275db703c81a65c1 Mon Sep 17 00:00:00 2001
From: Julian Nagele <j.nagele at apple.com>
Date: Wed, 27 Mar 2024 09:30:27 +0000
Subject: [PATCH 06/54] [TBAA] Add verifier for tbaa.struct metadata (#86709)

Adds logic to the IR verifier that checks whether !tbaa.struct nodes are
well-formed. That is, it checks that the operands of !tbaa.struct nodes
are in groups of three, that each group of three operands consists of
two integers and a valid tbaa node, and that the regions described by
the offset and size operands are non-overlapping.

PR: https://github.com/llvm/llvm-project/pull/86709
---
 llvm/include/llvm/IR/Verifier.h               |  1 +
 llvm/lib/IR/Verifier.cpp                      | 32 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/arm64-abi_align.ll  |  4 ++-
 .../AMDGPU/mem-intrinsics.ll                  |  2 +-
 .../InstCombine/struct-assign-tbaa.ll         |  2 +-
 llvm/test/Transforms/SROA/tbaa-struct3.ll     |  2 +-
 .../Scalarizer/basic-inseltpoison.ll          |  3 +-
 llvm/test/Transforms/Scalarizer/basic.ll      |  3 +-
 llvm/test/Verifier/tbaa-struct.ll             | 14 ++++++--
 9 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/IR/Verifier.h b/llvm/include/llvm/IR/Verifier.h
index b25f8eb77ee38b..b7db6e0bbfb71c 100644
--- a/llvm/include/llvm/IR/Verifier.h
+++ b/llvm/include/llvm/IR/Verifier.h
@@ -77,6 +77,7 @@ class TBAAVerifier {
   /// Visit an instruction and return true if it is valid, return false if an
   /// invalid TBAA is attached.
   bool visitTBAAMetadata(Instruction &I, const MDNode *MD);
+  bool visitTBAAStructMetadata(Instruction &I, const MDNode *MD);
 };
 
 /// Check a function for errors, useful for use when debugging a
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 33f358440a312d..e16572540f96c2 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5096,6 +5096,9 @@ void Verifier::visitInstruction(Instruction &I) {
   if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa))
     TBAAVerifyHelper.visitTBAAMetadata(I, TBAA);
 
+  if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa_struct))
+    TBAAVerifyHelper.visitTBAAStructMetadata(I, TBAA);
+
   if (MDNode *MD = I.getMetadata(LLVMContext::MD_noalias))
     visitAliasScopeListMetadata(MD);
   if (MDNode *MD = I.getMetadata(LLVMContext::MD_alias_scope))
@@ -7419,6 +7422,35 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) {
   return true;
 }
 
+bool TBAAVerifier::visitTBAAStructMetadata(Instruction &I, const MDNode *MD) {
+  CheckTBAA(MD->getNumOperands() % 3 == 0,
+            "tbaa.struct operands must occur in groups of three", &I, MD);
+
+  // Each group of three operands must consist of two integers and a
+  // tbaa node. Moreover, the regions described by the offset and size
+  // operands must be non-overlapping.
+  std::optional<APInt> NextFree;
+  for (unsigned int Idx = 0; Idx < MD->getNumOperands(); Idx += 3) {
+    auto *OffsetCI =
+        mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(Idx));
+    CheckTBAA(OffsetCI, "Offset must be a constant integer", &I, MD);
+
+    auto *SizeCI =
+        mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(Idx + 1));
+    CheckTBAA(SizeCI, "Size must be a constant integer", &I, MD);
+
+    MDNode *TBAA = dyn_cast_or_null<MDNode>(MD->getOperand(Idx + 2));
+    CheckTBAA(TBAA, "TBAA tag missing", &I, MD);
+    visitTBAAMetadata(I, TBAA);
+
+    bool NonOverlapping = !NextFree || NextFree->ule(OffsetCI->getValue());
+    CheckTBAA(NonOverlapping, "Overlapping tbaa.struct regions", &I, MD);
+
+    NextFree = OffsetCI->getValue() + SizeCI->getValue();
+  }
+  return true;
+}
+
 char VerifierLegacyPass::ID = 0;
 INITIALIZE_PASS(VerifierLegacyPass, "verify", "Module Verifier", false, false)
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
index 089e171e5a4a79..c9fd2d38e27acd 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -518,4 +518,6 @@ attributes #5 = { nobuiltin }
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}
 !3 = !{!"short", !1}
-!4 = !{i64 0, i64 4, !0, i64 4, i64 2, !3, i64 8, i64 4, !0, i64 12, i64 2, !3, i64 16, i64 4, !0, i64 20, i64 2, !3}
+!4 = !{i64 0, i64 4, !5, i64 4, i64 2, !6, i64 8, i64 4, !5, i64 12, i64 2, !6, i64 16, i64 4, !5, i64 20, i64 2, !6}
+!5 = !{!0, !0, i64 0}
+!6 = !{!3, !3, i64 0}
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
index 50b0e7a0f5471b..2f264a2432fc3d 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
@@ -141,4 +141,4 @@ attributes #1 = { argmemonly nounwind }
 !5 = distinct !{!5, !"some domain"}
 !6 = !{!7}
 !7 = distinct !{!7, !5, !"some scope 2"}
-!8 = !{i64 0, i64 8, null}
+!8 = !{i64 0, i64 8, !0}
diff --git a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
index 996d2c0e67e165..d079c03f1dcb93 100644
--- a/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
+++ b/llvm/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -75,7 +75,7 @@ entry:
 !1 = !{!"omnipotent char", !0}
 !2 = !{!5, !5, i64 0}
 !3 = !{i64 0, i64 4, !2}
-!4 = !{i64 0, i64 8, null}
+!4 = !{i64 0, i64 8, !2}
 !5 = !{!"float", !0}
 !6 = !{i64 0, i64 4, !2, i64 4, i64 4, !2}
 !7 = !{i64 0, i64 2, !2, i64 4, i64 6, !2}
diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll
index 0fcd787fef9769..61034de81e4b27 100644
--- a/llvm/test/Transforms/SROA/tbaa-struct3.ll
+++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll
@@ -539,7 +539,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias
 !6 = !{!5, !5, i64 0}
 !7 = !{i64 0, i64 8, !6, i64 8, i64 4, !1}
 !8 = !{i64 0, i64 4, !1, i64 4, i64 8, !6}
-!9 = !{i64 0, i64 8, !6, i64 4, i64 8, !1}
+!9 = !{i64 0, i64 8, !6, i64 8, i64 8, !1}
 !10 = !{i64 0, i64 2, !1, i64 2, i64 2, !1}
 !11 = !{i64 0, i64 1, !1, i64 1, i64 3, !1}
 !12 = !{i64 0, i64 2, !1, i64 2, i64 6, !1}
diff --git a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
index bbcdcb6f586742..73ae66dd76c66e 100644
--- a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
+++ b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll
@@ -836,5 +836,6 @@ define <2 x i32> @f23_crash(<2 x i32> %srcvec, i32 %v1) {
 !2 = !{ !"set2", !0 }
 !3 = !{ !3, !{!"llvm.loop.parallel_accesses", !13} }
 !4 = !{ float 4.0 }
-!5 = !{ i64 0, i64 8, null }
+!5 = !{ i64 0, i64 8, !6 }
+!6 = !{ !1, !1, i64 0 }
 !13 = distinct !{}
diff --git a/llvm/test/Transforms/Scalarizer/basic.ll b/llvm/test/Transforms/Scalarizer/basic.ll
index db7c5f535f7e9d..87a70ccd3fc7c5 100644
--- a/llvm/test/Transforms/Scalarizer/basic.ll
+++ b/llvm/test/Transforms/Scalarizer/basic.ll
@@ -870,5 +870,6 @@ define <2 x float> @f25(<2 x float> %src) {
 !2 = !{ !"set2", !0 }
 !3 = !{ !3, !{!"llvm.loop.parallel_accesses", !13} }
 !4 = !{ float 4.0 }
-!5 = !{ i64 0, i64 8, null }
+!5 = !{ i64 0, i64 8, !6 }
+!6 = !{ !1, !1, i64 0 }
 !13 = distinct !{}
diff --git a/llvm/test/Verifier/tbaa-struct.ll b/llvm/test/Verifier/tbaa-struct.ll
index b8ddc7cee496a9..14c19a19d5ae89 100644
--- a/llvm/test/Verifier/tbaa-struct.ll
+++ b/llvm/test/Verifier/tbaa-struct.ll
@@ -1,28 +1,36 @@
-; RUN: llvm-as < %s 2>&1
-
-; FIXME: The verifer should reject the invalid !tbaa.struct nodes below.
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
 
 define void @test_overlapping_regions(ptr %a1) {
+; CHECK: Overlapping tbaa.struct regions
+; CHECK-NEXT:  %ld = load i8, ptr %a1, align 1, !tbaa.struct !0
   %ld = load i8, ptr %a1, align 1, !tbaa.struct !0
   ret void
 }
 
 define void @test_size_not_integer(ptr %a1) {
+; CHECK: Size must be a constant integer
+; CHECK-NEXT:  store i8 1, ptr %a1, align 1, !tbaa.struct !5
   store i8 1, ptr %a1, align 1, !tbaa.struct !5
   ret void
 }
 
 define void @test_offset_not_integer(ptr %a1, ptr %a2) {
+; CHECK: Offset must be a constant integer
+; CHECK-NEXT:  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !6
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !6
   ret void
 }
 
 define void @test_tbaa_missing(ptr %a1, ptr %a2) {
+; CHECK: TBAA tag missing
+; CHECK-NEXT:  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !7
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a1, ptr align 8 %a2, i64 16, i1 false), !tbaa.struct !7
   ret void
 }
 
 define void @test_tbaa_invalid(ptr %a1) {
+; CHECK: Old-style TBAA is no longer allowed, use struct-path TBAA instead
+; CHECK-NEXT:  store i8 1, ptr %a1, align 1, !tbaa.struct !8
   store i8 1, ptr %a1, align 1, !tbaa.struct !8
   ret void
 }

>From f15b7deeaaf9028a31f66110a10f1313ed5e57f7 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 27 Mar 2024 17:40:44 +0800
Subject: [PATCH 07/54] [RISCV] Add test case to show missing vmerge fold on
 tied pseudos. NFC

Note we can't use vwaddu.wv because it will get combined away with #78403
---
 .../RISCV/rvv/rvv-peephole-vmerge-vops.ll        | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index a4aef577bc9ae7..571e2df13c2636 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -1187,3 +1187,19 @@ define <vscale x 2 x i32> @vmerge_larger_vl_false_becomes_tail(<vscale x 2 x i32
   %b = call <vscale x 2 x i32> @llvm.riscv.vmerge.nxv2i32.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i32> %false, <vscale x 2 x i32> %a, <vscale x 2 x i1> %m, i64 3)
   ret <vscale x 2 x i32> %b
 }
+
+; Test widening pseudos with their TIED variant (passthru same as first op).
+define <vscale x 2 x i64> @vpmerge_vwsub.w_tied(<vscale x 2 x i64> %passthru, <vscale x 2 x i64> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask, i32 zeroext %vl) {
+; CHECK-LABEL: vpmerge_vwsub.w_tied:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT:    vmv2r.v v10, v8
+; CHECK-NEXT:    vwsub.wv v10, v10, v12
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
+; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT:    ret
+  %vl.zext = zext i32 %vl to i64
+  %a = call <vscale x 2 x i64> @llvm.riscv.vwsub.w.nxv2i64.nxv2i32(<vscale x 2 x i64> %passthru, <vscale x 2 x i64> %passthru, <vscale x 2 x i32> %y, i64 %vl.zext)
+  %b = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru, i32 %vl)
+  ret <vscale x 2 x i64> %b
+}

>From cc23ee8250c2eda3f28c4d25c412e68ec78ecbe1 Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek at codeweavers.com>
Date: Wed, 27 Mar 2024 11:37:02 +0100
Subject: [PATCH 08/54] [LLD][COFF] Add support for EXPORTAS import name type.
 (#86541)

#78772 added similar support for .def file parser and import library
writer. This PR adds missing bits in LLD to propagate EXPORTAS name and
allow it in `/export` parser. This syntax is used by MSVC for ARM64EC
`__declspec(dllexport)` handling.
---
 lld/COFF/Config.h           |  8 ++--
 lld/COFF/Driver.cpp         |  2 +
 lld/COFF/DriverUtils.cpp    | 14 +++++-
 lld/test/COFF/exportas.test | 88 +++++++++++++++++++++++++++++++++++++
 4 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h
index 8f85929f1bea7f..917f88fc28280b 100644
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -54,6 +54,7 @@ enum class EmitKind { Obj, LLVM, ASM };
 struct Export {
   StringRef name;       // N in /export:N or /export:E=N
   StringRef extName;    // E in /export:E=N
+  StringRef exportAs;   // E in /export:N,EXPORTAS,E
   StringRef aliasTarget; // GNU specific: N in "alias == N"
   Symbol *sym = nullptr;
   uint16_t ordinal = 0;
@@ -73,10 +74,9 @@ struct Export {
   StringRef exportName; // Name in DLL
 
   bool operator==(const Export &e) const {
-    return (name == e.name && extName == e.extName &&
-            aliasTarget == e.aliasTarget &&
-            ordinal == e.ordinal && noname == e.noname &&
-            data == e.data && isPrivate == e.isPrivate);
+    return (name == e.name && extName == e.extName && exportAs == e.exportAs &&
+            aliasTarget == e.aliasTarget && ordinal == e.ordinal &&
+            noname == e.noname && data == e.data && isPrivate == e.isPrivate);
   }
 };
 
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 181492913c0d98..2b1d4abb6ed0d6 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -945,6 +945,7 @@ void LinkerDriver::createImportLibrary(bool asLib) {
     e2.Name = std::string(e1.name);
     e2.SymbolName = std::string(e1.symbolName);
     e2.ExtName = std::string(e1.extName);
+    e2.ExportAs = std::string(e1.exportAs);
     e2.AliasTarget = std::string(e1.aliasTarget);
     e2.Ordinal = e1.ordinal;
     e2.Noname = e1.noname;
@@ -1044,6 +1045,7 @@ void LinkerDriver::parseModuleDefs(StringRef path) {
       e2.name = saver().save(e1.Name);
       e2.extName = saver().save(e1.ExtName);
     }
+    e2.exportAs = saver().save(e1.ExportAs);
     e2.aliasTarget = saver().save(e1.AliasTarget);
     e2.ordinal = e1.Ordinal;
     e2.noname = e1.Noname;
diff --git a/lld/COFF/DriverUtils.cpp b/lld/COFF/DriverUtils.cpp
index 0fa4769bab19db..b4ff31a606da5e 100644
--- a/lld/COFF/DriverUtils.cpp
+++ b/lld/COFF/DriverUtils.cpp
@@ -585,7 +585,8 @@ Export LinkerDriver::parseExport(StringRef arg) {
     }
   }
 
-  // Optional parameters "[,@ordinal[,NONAME]][,DATA][,PRIVATE]"
+  // Optional parameters
+  // "[,@ordinal[,NONAME]][,DATA][,PRIVATE][,EXPORTAS,exportname]"
   while (!rest.empty()) {
     StringRef tok;
     std::tie(tok, rest) = rest.split(",");
@@ -607,6 +608,13 @@ Export LinkerDriver::parseExport(StringRef arg) {
       e.isPrivate = true;
       continue;
     }
+    if (tok.equals_insensitive("exportas")) {
+      if (!rest.empty() && !rest.contains(','))
+        e.exportAs = rest;
+      else
+        error("invalid EXPORTAS value: " + rest);
+      break;
+    }
     if (tok.starts_with("@")) {
       int32_t ord;
       if (tok.substr(1).getAsInteger(0, ord))
@@ -683,7 +691,9 @@ void LinkerDriver::fixupExports() {
   }
 
   for (Export &e : ctx.config.exports) {
-    if (!e.forwardTo.empty()) {
+    if (!e.exportAs.empty()) {
+      e.exportName = e.exportAs;
+    } else if (!e.forwardTo.empty()) {
       e.exportName = undecorate(ctx, e.name);
     } else {
       e.exportName = undecorate(ctx, e.extName.empty() ? e.name : e.extName);
diff --git a/lld/test/COFF/exportas.test b/lld/test/COFF/exportas.test
index c0295c3d7fb76d..d70547c39b40b4 100644
--- a/lld/test/COFF/exportas.test
+++ b/lld/test/COFF/exportas.test
@@ -9,6 +9,77 @@ RUN: lld-link -out:out1.dll -dll -noentry test.obj test.lib
 RUN: llvm-readobj --coff-imports out1.dll | FileCheck --check-prefix=IMPORT %s
 IMPORT: Symbol: expfunc
 
+Pass -export argument with EXPORTAS.
+
+RUN: llvm-mc -filetype=obj -triple=x86_64-windows func.s -o func.obj
+RUN: lld-link -out:out2.dll -dll -noentry func.obj -export:func,EXPORTAS,expfunc
+RUN: llvm-readobj --coff-exports out2.dll | FileCheck --check-prefix=EXPORT %s
+EXPORT: Name: expfunc
+
+RUN: llvm-readobj out2.lib | FileCheck --check-prefix=IMPLIB %s
+IMPLIB:      Name type: export as
+IMPLIB-NEXT: Export name: expfunc
+IMPLIB-NEXT: Symbol: __imp_func
+IMPLIB-NEXT: Symbol: func
+
+Use .drectve section with EXPORTAS.
+
+RUN: llvm-mc -filetype=obj -triple=x86_64-windows drectve.s -o drectve.obj
+RUN: lld-link -out:out3.dll -dll -noentry func.obj drectve.obj
+RUN: llvm-readobj --coff-exports out3.dll | FileCheck --check-prefix=EXPORT %s
+RUN: llvm-readobj out3.lib | FileCheck --check-prefix=IMPLIB %s
+
+Use a .def file with EXPORTAS.
+
+RUN: lld-link -out:out4.dll -dll -noentry func.obj -def:test.def
+RUN: llvm-readobj --coff-exports out4.dll | FileCheck --check-prefix=EXPORT %s
+RUN: llvm-readobj out4.lib | FileCheck --check-prefix=IMPLIB %s
+
+Use a .def file with EXPORTAS in a forwarding export.
+
+RUN: lld-link -out:out5.dll -dll -noentry func.obj -def:test2.def
+RUN: llvm-readobj --coff-exports out5.dll | FileCheck --check-prefix=FORWARD-EXPORT %s
+FORWARD-EXPORT:      Export {
+FORWARD-EXPORT-NEXT:   Ordinal: 1
+FORWARD-EXPORT-NEXT:   Name: expfunc
+FORWARD-EXPORT-NEXT:   ForwardedTo: otherdll.otherfunc
+FORWARD-EXPORT-NEXT: }
+
+RUN: llvm-readobj out5.lib | FileCheck --check-prefix=FORWARD-IMPLIB %s
+FORWARD-IMPLIB:      Name type: export as
+FORWARD-IMPLIB-NEXT: Export name: expfunc
+FORWARD-IMPLIB-NEXT: Symbol: __imp_func
+FORWARD-IMPLIB-NEXT: Symbol: func
+
+Pass -export argument with EXPORTAS in a forwarding export.
+
+RUN: lld-link -out:out6.dll -dll -noentry func.obj -export:func=otherdll.otherfunc,EXPORTAS,expfunc
+RUN: llvm-readobj --coff-exports out6.dll | FileCheck --check-prefix=FORWARD-EXPORT %s
+RUN: llvm-readobj out6.lib | FileCheck --check-prefix=FORWARD-IMPLIB %s
+
+Pass -export argument with EXPORTAS in a data export.
+
+RUN: lld-link -out:out7.dll -dll -noentry func.obj -export:func,DATA,@5,EXPORTAS,expfunc
+RUN: llvm-readobj --coff-exports out7.dll | FileCheck --check-prefix=ORD %s
+ORD:      Ordinal: 5
+ORD-NEXT: Name: expfunc
+
+RUN: llvm-readobj out7.lib | FileCheck --check-prefix=ORD-IMPLIB %s
+ORD-IMPLIB:      Type: data
+ORD-IMPLIB-NEXT: Name type: export as
+ORD-IMPLIB-NEXT: Export name: expfunc
+ORD-IMPLIB-NEXT: Symbol: __imp_func
+
+Check invalid EXPORTAS syntax.
+
+RUN: not lld-link -out:err1.dll -dll -noentry func.obj -export:func,EXPORTAS, 2>&1 | \
+RUN:     FileCheck --check-prefix=ERR1 %s
+ERR1: error: invalid EXPORTAS value: {{$}}
+
+RUN: not lld-link -out:err2.dll -dll -noentry func.obj -export:func,EXPORTAS,expfunc,DATA 2>&1 | \
+RUN:     FileCheck --check-prefix=ERR2 %s
+ERR2: error: invalid EXPORTAS value: expfunc,DATA
+
 #--- test.s
     .section ".test", "rd"
     .rva __imp_func
@@ -17,3 +88,20 @@ IMPORT: Symbol: expfunc
 LIBRARY test.dll
 EXPORTS
     func EXPORTAS expfunc
+
+#--- test2.def
+LIBRARY test.dll
+EXPORTS
+    func=otherdll.otherfunc EXPORTAS expfunc
+
+#--- func.s
+    .text
+    .globl func
+    .p2align 2, 0x0
+func:
+    movl $1, %eax
+    retq
+
+#--- drectve.s
+    .section .drectve, "yn"
+    .ascii " -export:func,EXPORTAS,expfunc"

>From c9d12664f2f967ec170ed16d9a57af2f48e832c8 Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek at codeweavers.com>
Date: Wed, 27 Mar 2024 11:41:02 +0100
Subject: [PATCH 09/54] [llvm-dlltool][llvm-lib][COFF] Don't override NONAME
 exports with demangled ARM64EC symbols. (#86722)

---
 llvm/lib/Object/COFFImportFile.cpp           |  4 +-
 llvm/test/tools/llvm-lib/arm64ec-implib.test | 93 ++++++++++++++++++++
 2 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index 8224a1492502f6..477c5bf98249f7 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -690,12 +690,12 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
       if (ImportType == IMPORT_CODE && isArm64EC(M)) {
         if (std::optional<std::string> MangledName =
                 getArm64ECMangledFunctionName(Name)) {
-          if (ExportName.empty()) {
+          if (!E.Noname && ExportName.empty()) {
             NameType = IMPORT_NAME_EXPORTAS;
             ExportName.swap(Name);
           }
           Name = std::move(*MangledName);
-        } else if (ExportName.empty()) {
+        } else if (!E.Noname && ExportName.empty()) {
           NameType = IMPORT_NAME_EXPORTAS;
           ExportName = std::move(*getArm64ECDemangledFunctionName(Name));
         }
diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test b/llvm/test/tools/llvm-lib/arm64ec-implib.test
index 9ce53fe0fea077..e9987d0ca2e645 100644
--- a/llvm/test/tools/llvm-lib/arm64ec-implib.test
+++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test
@@ -14,6 +14,8 @@ ARMAP-NEXT: Archive EC map
 ARMAP-NEXT: #expname in test.dll
 ARMAP-NEXT: #funcexp in test.dll
 ARMAP-NEXT: #mangledfunc in test.dll
+ARMAP-NEXT: #manglednonamefunc in test.dll
+ARMAP-NEXT: #nonamefunc in test.dll
 ARMAP-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll
 ARMAP-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll
 ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll
@@ -23,13 +25,19 @@ ARMAP-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll
 ARMAP-NEXT: __imp_aux_expname in test.dll
 ARMAP-NEXT: __imp_aux_funcexp in test.dll
 ARMAP-NEXT: __imp_aux_mangledfunc in test.dll
+ARMAP-NEXT: __imp_aux_manglednonamefunc in test.dll
+ARMAP-NEXT: __imp_aux_nonamefunc in test.dll
 ARMAP-NEXT: __imp_dataexp in test.dll
 ARMAP-NEXT: __imp_expname in test.dll
 ARMAP-NEXT: __imp_funcexp in test.dll
 ARMAP-NEXT: __imp_mangledfunc in test.dll
+ARMAP-NEXT: __imp_manglednonamefunc in test.dll
+ARMAP-NEXT: __imp_nonamefunc in test.dll
 ARMAP-NEXT: expname in test.dll
 ARMAP-NEXT: funcexp in test.dll
 ARMAP-NEXT: mangledfunc in test.dll
+ARMAP-NEXT: manglednonamefunc in test.dll
+ARMAP-NEXT: nonamefunc in test.dll
 ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll
 
 RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s
@@ -95,6 +103,25 @@ READOBJ-NEXT: Type: data
 READOBJ-NEXT: Name type: name
 READOBJ-NEXT: Export name: dataexp
 READOBJ-NEXT: Symbol: __imp_dataexp
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: ordinal
+READOBJ-NEXT: Symbol: __imp_nonamefunc
+READOBJ-NEXT: Symbol: nonamefunc
+READOBJ-NEXT: Symbol: __imp_aux_nonamefunc
+READOBJ-NEXT: Symbol: #nonamefunc
+READOBJ-EMPTY:
+READOBJ-NEXT: File: test.dll
+READOBJ-NEXT: Format: COFF-import-file-ARM64EC
+READOBJ-NEXT: Type: code
+READOBJ-NEXT: Name type: ordinal
+READOBJ-NEXT: Symbol: __imp_manglednonamefunc
+READOBJ-NEXT: Symbol: manglednonamefunc
+READOBJ-NEXT: Symbol: __imp_aux_manglednonamefunc
+READOBJ-NEXT: Symbol: #manglednonamefunc
+
 
 Using -machine:arm64x gives the same output.
 RUN: llvm-lib -machine:arm64x -def:test.def -out:testx.lib
@@ -112,22 +139,28 @@ RUN: llvm-nm --print-armap testx.lib | FileCheck -check-prefix=ARMAPX %s
 
 ARMAPX:      Archive map
 ARMAPX-NEXT: #mangledfunc in test.dll
+ARMAPX-NEXT: #manglednonamefunc in test.dll
 ARMAPX-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll
 ARMAPX-NEXT: __IMPORT_DESCRIPTOR_test in test.dll
 ARMAPX-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll
 ARMAPX-NEXT: __imp_#mangledfunc in test.dll
+ARMAPX-NEXT: __imp_#manglednonamefunc in test.dll
 ARMAPX-NEXT: __imp_?test_cpp_func@@YAHPEAX@Z in test.dll
 ARMAPX-NEXT: __imp_dataexp in test.dll
 ARMAPX-NEXT: __imp_expname in test.dll
 ARMAPX-NEXT: __imp_funcexp in test.dll
+ARMAPX-NEXT: __imp_nonamefunc in test.dll
 ARMAPX-NEXT: expname in test.dll
 ARMAPX-NEXT: funcexp in test.dll
+ARMAPX-NEXT: nonamefunc in test.dll
 ARMAPX-NEXT: test_NULL_THUNK_DATA in test.dll
 ARMAPX-EMPTY:
 ARMAPX-NEXT: Archive EC map
 ARMAPX-NEXT: #expname in test.dll
 ARMAPX-NEXT: #funcexp in test.dll
 ARMAPX-NEXT: #mangledfunc in test.dll
+ARMAPX-NEXT: #manglednonamefunc in test.dll
+ARMAPX-NEXT: #nonamefunc in test.dll
 ARMAPX-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test.dll
 ARMAPX-NEXT: ?test_cpp_func@@YAHPEAX@Z in test.dll
 ARMAPX-NEXT: __IMPORT_DESCRIPTOR_test in test.dll
@@ -137,13 +170,19 @@ ARMAPX-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test.dll
 ARMAPX-NEXT: __imp_aux_expname in test.dll
 ARMAPX-NEXT: __imp_aux_funcexp in test.dll
 ARMAPX-NEXT: __imp_aux_mangledfunc in test.dll
+ARMAPX-NEXT: __imp_aux_manglednonamefunc in test.dll
+ARMAPX-NEXT: __imp_aux_nonamefunc in test.dll
 ARMAPX-NEXT: __imp_dataexp in test.dll
 ARMAPX-NEXT: __imp_expname in test.dll
 ARMAPX-NEXT: __imp_funcexp in test.dll
 ARMAPX-NEXT: __imp_mangledfunc in test.dll
+ARMAPX-NEXT: __imp_manglednonamefunc in test.dll
+ARMAPX-NEXT: __imp_nonamefunc in test.dll
 ARMAPX-NEXT: expname in test.dll
 ARMAPX-NEXT: funcexp in test.dll
 ARMAPX-NEXT: mangledfunc in test.dll
+ARMAPX-NEXT: manglednonamefunc in test.dll
+ARMAPX-NEXT: nonamefunc in test.dll
 ARMAPX-NEXT: test_NULL_THUNK_DATA in test.dll
 
 RUN: llvm-readobj testx.lib | FileCheck -check-prefix=READOBJX %s
@@ -211,6 +250,24 @@ READOBJX-NEXT: Export name: dataexp
 READOBJX-NEXT: Symbol: __imp_dataexp
 READOBJX-EMPTY:
 READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64EC
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: ordinal
+READOBJX-NEXT: Symbol: __imp_nonamefunc
+READOBJX-NEXT: Symbol: nonamefunc
+READOBJX-NEXT: Symbol: __imp_aux_nonamefunc
+READOBJX-NEXT: Symbol: #nonamefunc
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64EC
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: ordinal
+READOBJX-NEXT: Symbol: __imp_manglednonamefunc
+READOBJX-NEXT: Symbol: manglednonamefunc
+READOBJX-NEXT: Symbol: __imp_aux_manglednonamefunc
+READOBJX-NEXT: Symbol: #manglednonamefunc
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
 READOBJX-NEXT: Format: COFF-import-file-ARM64
 READOBJX-NEXT: Type: code
 READOBJX-NEXT: Name type: name
@@ -248,6 +305,20 @@ READOBJX-NEXT: Type: data
 READOBJX-NEXT: Name type: name
 READOBJX-NEXT: Export name: dataexp
 READOBJX-NEXT: Symbol: __imp_dataexp
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: ordinal
+READOBJX-NEXT: Symbol: __imp_nonamefunc
+READOBJX-NEXT: Symbol: nonamefunc
+READOBJX-EMPTY:
+READOBJX-NEXT: File: test.dll
+READOBJX-NEXT: Format: COFF-import-file-ARM64
+READOBJX-NEXT: Type: code
+READOBJX-NEXT: Name type: ordinal
+READOBJX-NEXT: Symbol: __imp_#manglednonamefunc
+READOBJX-NEXT: Symbol: #manglednonamefunc
 
 
 RUN: llvm-lib -machine:arm64ec -def:test.def -defArm64Native:test2.def -out:test2.lib
@@ -266,6 +337,8 @@ ARMAPX2-NEXT: Archive EC map
 ARMAPX2-NEXT: #expname in test2.dll
 ARMAPX2-NEXT: #funcexp in test2.dll
 ARMAPX2-NEXT: #mangledfunc in test2.dll
+ARMAPX2-NEXT: #manglednonamefunc in test2.dll
+ARMAPX2-NEXT: #nonamefunc in test2.dll
 ARMAPX2-NEXT: ?test_cpp_func@@$$hYAHPEAX@Z in test2.dll
 ARMAPX2-NEXT: ?test_cpp_func@@YAHPEAX@Z in test2.dll
 ARMAPX2-NEXT: __IMPORT_DESCRIPTOR_test2 in test2.dll
@@ -275,13 +348,19 @@ ARMAPX2-NEXT: __imp_aux_?test_cpp_func@@YAHPEAX@Z in test2.dll
 ARMAPX2-NEXT: __imp_aux_expname in test2.dll
 ARMAPX2-NEXT: __imp_aux_funcexp in test2.dll
 ARMAPX2-NEXT: __imp_aux_mangledfunc in test2.dll
+ARMAPX2-NEXT: __imp_aux_manglednonamefunc in test2.dll
+ARMAPX2-NEXT: __imp_aux_nonamefunc in test2.dll
 ARMAPX2-NEXT: __imp_dataexp in test2.dll
 ARMAPX2-NEXT: __imp_expname in test2.dll
 ARMAPX2-NEXT: __imp_funcexp in test2.dll
 ARMAPX2-NEXT: __imp_mangledfunc in test2.dll
+ARMAPX2-NEXT: __imp_manglednonamefunc in test2.dll
+ARMAPX2-NEXT: __imp_nonamefunc in test2.dll
 ARMAPX2-NEXT: expname in test2.dll
 ARMAPX2-NEXT: funcexp in test2.dll
 ARMAPX2-NEXT: mangledfunc in test2.dll
+ARMAPX2-NEXT: manglednonamefunc in test2.dll
+ARMAPX2-NEXT: nonamefunc in test2.dll
 ARMAPX2-NEXT: test2_NULL_THUNK_DATA in test2.dll
 
 ARMAPX2:      test2.dll:
@@ -312,6 +391,18 @@ ARMAPX2-NEXT: test2.dll:
 ARMAPX2-NEXT: 00000000 D __imp_dataexp
 ARMAPX2-EMPTY:
 ARMAPX2-NEXT: test2.dll:
+ARMAPX2-NEXT: 00000000 T #nonamefunc
+ARMAPX2-NEXT: 00000000 T __imp_aux_nonamefunc
+ARMAPX2-NEXT: 00000000 T __imp_nonamefunc
+ARMAPX2-NEXT: 00000000 T nonamefunc
+ARMAPX2-EMPTY:
+ARMAPX2-NEXT: test2.dll:
+ARMAPX2-NEXT: 00000000 T #manglednonamefunc
+ARMAPX2-NEXT: 00000000 T __imp_aux_manglednonamefunc
+ARMAPX2-NEXT: 00000000 T __imp_manglednonamefunc
+ARMAPX2-NEXT: 00000000 T manglednonamefunc
+ARMAPX2-EMPTY:
+ARMAPX2-NEXT: test2.dll:
 ARMAPX2-NEXT: 00000000 T __imp_otherfunc
 ARMAPX2-NEXT: 00000000 T otherfunc
 
@@ -406,6 +497,8 @@ EXPORTS
     ?test_cpp_func@@YAHPEAX@Z
     expname=impname
     dataexp DATA
+    nonamefunc @1 NONAME
+    #manglednonamefunc @2 NONAME
 
 #--- test2.def
 LIBRARY test2.dll

>From ab7dba233a058cc8310ef829929238b5d8440b30 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Wed, 27 Mar 2024 13:41:34 +0200
Subject: [PATCH 10/54] [CodeGen][LLVM] Make the `va_list` related intrinsics
 generic. (#85460)

Currently, the builtins used for implementing `va_list` handling
unconditionally take their arguments as unqualified `ptr`s i.e. pointers
to AS 0. This does not work for targets where the default AS is not 0 or
AS 0 is not a viable AS (for example, a target might choose 0 to
represent the constant address space). This patch changes the builtins'
signature to take generic `anyptr` args, which corrects this issue. It
is noisy due to the number of tests affected. A test for an upstream
target which does not use 0 as its default AS (SPIRV for HIP device
compilations) is added as well.
---
 clang/lib/CodeGen/CGBuiltin.cpp               |  6 ++-
 clang/test/CodeGen/CSKY/csky-abi.c            | 16 +++----
 clang/test/CodeGen/LoongArch/abi-lp64d.c      |  4 +-
 .../test/CodeGen/PowerPC/aix-altivec-vaargs.c |  4 +-
 clang/test/CodeGen/PowerPC/aix-vaargs.c       | 14 +++---
 .../CodeGen/PowerPC/ppc64le-varargs-f128.c    | 18 +++----
 clang/test/CodeGen/RISCV/riscv32-vararg.c     | 40 ++++++++--------
 clang/test/CodeGen/RISCV/riscv64-vararg.c     | 16 +++----
 clang/test/CodeGen/WebAssembly/wasm-varargs.c | 16 +++----
 clang/test/CodeGen/X86/va-arg-sse.c           |  4 +-
 clang/test/CodeGen/X86/x86_64-vaarg.c         |  4 +-
 clang/test/CodeGen/aarch64-ABI-align-packed.c | 14 +++---
 clang/test/CodeGen/aarch64-varargs.c          |  2 +-
 clang/test/CodeGen/arm-varargs.c              |  2 +-
 clang/test/CodeGen/hexagon-linux-vararg.c     |  2 +-
 clang/test/CodeGen/mips-varargs.c             | 16 +++----
 clang/test/CodeGen/pr53127.cpp                |  4 +-
 ...rargs-with-nonzero-default-address-space.c | 46 ++++++++++++++++++
 clang/test/CodeGen/xcore-abi.c                |  2 +-
 clang/test/CodeGenCXX/ext-int.cpp             | 12 ++---
 clang/test/CodeGenCXX/ibm128-declarations.cpp |  4 +-
 clang/test/CodeGenCXX/x86_64-vaarg.cpp        |  4 +-
 clang/test/Modules/codegen.test               |  2 +-
 llvm/docs/LangRef.rst                         | 29 ++++++-----
 llvm/include/llvm/IR/Intrinsics.td            | 11 +++--
 llvm/test/Bitcode/compatibility-3.6.ll        | 16 +++----
 llvm/test/Bitcode/compatibility-3.7.ll        | 16 +++----
 llvm/test/Bitcode/compatibility-3.8.ll        | 16 +++----
 llvm/test/Bitcode/compatibility-3.9.ll        | 16 +++----
 llvm/test/Bitcode/compatibility-4.0.ll        | 16 +++----
 llvm/test/Bitcode/compatibility-5.0.ll        | 16 +++----
 llvm/test/Bitcode/compatibility-6.0.ll        | 16 +++----
 llvm/test/Bitcode/compatibility.ll            | 18 +++----
 llvm/test/Bitcode/thinlto-function-summary.ll |  6 +--
 .../Bitcode/variableArgumentIntrinsic.3.2.ll  |  8 ++--
 .../MemorySanitizer/AArch64/vararg_shadow.ll  | 48 +++++++++----------
 .../MemorySanitizer/SystemZ/vararg-kernel.ll  |  2 +-
 .../MemorySanitizer/X86/vararg_shadow.ll      | 48 +++++++++----------
 .../MemorySanitizer/msan_debug_info.ll        |  2 +-
 .../Transforms/GlobalOpt/inalloca-varargs.ll  |  2 +-
 .../Transforms/IROutliner/illegal-vaarg.ll    | 12 ++---
 .../IROutliner/outline-vaarg-intrinsic.ll     |  8 ++--
 llvm/test/Transforms/NewGVN/pr31483.ll        |  2 +-
 .../Transforms/Reassociate/vaarg_movable.ll   |  4 +-
 .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td   |  6 +--
 mlir/test/Target/LLVMIR/Import/basic.ll       | 14 +++---
 mlir/test/Target/LLVMIR/Import/intrinsic.ll   | 12 ++---
 mlir/test/Target/LLVMIR/llvmir.mlir           |  8 ++--
 48 files changed, 330 insertions(+), 274 deletions(-)
 create mode 100644 clang/test/CodeGen/varargs-with-nonzero-default-address-space.c

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3cfdb261a0eac0..fdb517eb254d3b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -792,7 +792,8 @@ EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) {
 
 Value *CodeGenFunction::EmitVAStartEnd(Value *ArgValue, bool IsStart) {
   Intrinsic::ID inst = IsStart ? Intrinsic::vastart : Intrinsic::vaend;
-  return Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue);
+  return Builder.CreateCall(CGM.getIntrinsic(inst, {ArgValue->getType()}),
+                            ArgValue);
 }
 
 /// Checks if using the result of __builtin_object_size(p, @p From) in place of
@@ -3018,7 +3019,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_va_copy: {
     Value *DstPtr = EmitVAListRef(E->getArg(0)).getPointer();
     Value *SrcPtr = EmitVAListRef(E->getArg(1)).getPointer();
-    Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy), {DstPtr, SrcPtr});
+    Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy, {DstPtr->getType()}),
+                       {DstPtr, SrcPtr});
     return RValue::get(nullptr);
   }
   case Builtin::BIabs:
diff --git a/clang/test/CodeGen/CSKY/csky-abi.c b/clang/test/CodeGen/CSKY/csky-abi.c
index 2e549376ba9330..29ed661aea75d9 100644
--- a/clang/test/CodeGen/CSKY/csky-abi.c
+++ b/clang/test/CodeGen/CSKY/csky-abi.c
@@ -185,13 +185,13 @@ void f_va_caller(void) {
 // CHECK:   [[VA:%.*]] = alloca ptr, align 4
 // CHECK:   [[V:%.*]] = alloca i32, align 4
 // CHECK:   store ptr %fmt, ptr [[FMT_ADDR]], align 4
-// CHECK:   call void @llvm.va_start(ptr [[VA]])
+// CHECK:   call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK:   [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK:   [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4
 // CHECK:   store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK:   [[TMP1:%.*]] = load i32, ptr [[ARGP_CUR]], align 4
 // CHECK:   store i32 [[TMP1]], ptr [[V]], align 4
-// CHECK:   call void @llvm.va_end(ptr [[VA]])
+// CHECK:   call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK:   [[TMP2:%.*]] = load i32, ptr [[V]], align 4
 // CHECK:   ret i32 [[TMP2]]
 // CHECK: }
@@ -210,13 +210,13 @@ int f_va_1(char *fmt, ...) {
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-NEXT:    [[V:%.*]] = alloca double, align 4
 // CHECK-NEXT:    store ptr [[FMT:%.*]], ptr [[FMT_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 8
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[ARGP_CUR]], align 4
 // CHECK-NEXT:    store double [[TMP4]], ptr [[V]], align 4
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr [[V]], align 4
 // CHECK-NEXT:    ret double [[TMP5]]
 double f_va_2(char *fmt, ...) {
@@ -236,7 +236,7 @@ double f_va_2(char *fmt, ...) {
 // CHECK-NEXT:    [[W:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    [[X:%.*]] = alloca double, align 4
 // CHECK-NEXT:    store ptr [[FMT:%.*]], ptr [[FMT_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 8
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
@@ -252,7 +252,7 @@ double f_va_2(char *fmt, ...) {
 // CHECK-NEXT:    store ptr [[ARGP_NEXT5]], ptr [[VA]], align 4
 // CHECK-NEXT:    [[TMP11:%.*]] = load double, ptr [[ARGP_CUR4]], align 4
 // CHECK-NEXT:    store double [[TMP11]], ptr [[X]], align 4
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP12:%.*]] = load double, ptr [[V]], align 4
 // CHECK-NEXT:    [[TMP13:%.*]] = load double, ptr [[X]], align 4
 // CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP12]], [[TMP13]]
@@ -279,7 +279,7 @@ double f_va_3(char *fmt, ...) {
 // CHECK-NEXT:    [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4
 // CHECK-NEXT:    [[RET:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store ptr [[FMT:%.*]], ptr [[FMT_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
@@ -302,7 +302,7 @@ double f_va_3(char *fmt, ...) {
 // CHECK-NEXT:    [[ARGP_NEXT9:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR8]], i32 16
 // CHECK-NEXT:    store ptr [[ARGP_NEXT9]], ptr [[VA]], align 4
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[LS]], ptr align 4 [[ARGP_CUR8]], i32 16, i1 false)
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 int f_va_4(char *fmt, ...) {
   __builtin_va_list va;
 
diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d.c b/clang/test/CodeGen/LoongArch/abi-lp64d.c
index 66b480a7f06894..fc7f1eada586b3 100644
--- a/clang/test/CodeGen/LoongArch/abi-lp64d.c
+++ b/clang/test/CodeGen/LoongArch/abi-lp64d.c
@@ -449,13 +449,13 @@ void f_va_caller(void) {
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[V:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store ptr [[FMT:%.*]], ptr [[FMT_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i64 8
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 8
 // CHECK-NEXT:    store i32 [[TMP0]], ptr [[V]], align 4
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[V]], align 4
 // CHECK-NEXT:    ret i32 [[TMP1]]
 int f_va_int(char *fmt, ...) {
diff --git a/clang/test/CodeGen/PowerPC/aix-altivec-vaargs.c b/clang/test/CodeGen/PowerPC/aix-altivec-vaargs.c
index 03182423a422c1..b3f1e93b639440 100644
--- a/clang/test/CodeGen/PowerPC/aix-altivec-vaargs.c
+++ b/clang/test/CodeGen/PowerPC/aix-altivec-vaargs.c
@@ -17,7 +17,7 @@ vector double vector_varargs(int count, ...) {
 }
 
 // CHECK:         %arg_list = alloca ptr
-// CHECK:         call void @llvm.va_start(ptr %arg_list)
+// CHECK:         call void @llvm.va_start.p0(ptr %arg_list)
 
 // AIX32:       for.body:
 // AIX32-NEXT:    %argp.cur = load ptr, ptr %arg_list, align 4
@@ -41,4 +41,4 @@ vector double vector_varargs(int count, ...) {
 
 
 // CHECK:      for.end:
-// CHECK:        call void @llvm.va_end(ptr %arg_list)
+// CHECK:        call void @llvm.va_end.p0(ptr %arg_list)
diff --git a/clang/test/CodeGen/PowerPC/aix-vaargs.c b/clang/test/CodeGen/PowerPC/aix-vaargs.c
index 8b8417d315a504..724ba6560cdb97 100644
--- a/clang/test/CodeGen/PowerPC/aix-vaargs.c
+++ b/clang/test/CodeGen/PowerPC/aix-vaargs.c
@@ -35,7 +35,7 @@ void testva (int n, ...) {
 
 // CHECK-NEXT:  %v = alloca i32, align 4
 // CHECK-NEXT:  store i32 %n, ptr %n.addr, align 4
-// CHECK-NEXT:  call void @llvm.va_start(ptr %ap)
+// CHECK-NEXT:  call void @llvm.va_start.p0(ptr %ap)
 
 // AIX32-NEXT:  %argp.cur = load ptr, ptr %ap, align 4
 // AIX32-NEXT:  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 16
@@ -48,7 +48,7 @@ void testva (int n, ...) {
 // AIX32-NEXT:  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %t, ptr align 4 %argp.cur, i32 16, i1 false)
 // AIX64-NEXT:  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %t, ptr align 8 %argp.cur, i64 16, i1 false)
 
-// CHECK-NEXT:  call void @llvm.va_copy(ptr %ap2, ptr %ap)
+// CHECK-NEXT:  call void @llvm.va_copy.p0(ptr %ap2, ptr %ap)
 
 // AIX32-NEXT:  %argp.cur1 = load ptr, ptr %ap2, align 4
 // AIX32-NEXT:  %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i32 4
@@ -62,14 +62,14 @@ void testva (int n, ...) {
 // AIX64-NEXT:  %1 = load i32, ptr %0, align 4
 // AIX64-NEXT:  store i32 %1, ptr %v, align 4
 
-// CHECK-NEXT:  call void @llvm.va_end(ptr %ap2)
-// CHECK-NEXT:  call void @llvm.va_end(ptr %ap)
+// CHECK-NEXT:  call void @llvm.va_end.p0(ptr %ap2)
+// CHECK-NEXT:  call void @llvm.va_end.p0(ptr %ap)
 // CHECK-NEXT:  ret void
 
-// CHECK: declare void @llvm.va_start(ptr)
+// CHECK: declare void @llvm.va_start.p0(ptr)
 
 // AIX32: declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg)
 // AIX64: declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
 
-// CHECK: declare void @llvm.va_copy(ptr, ptr)
-// CHECK: declare void @llvm.va_end(ptr)
+// CHECK: declare void @llvm.va_copy.p0(ptr, ptr)
+// CHECK: declare void @llvm.va_end.p0(ptr)
diff --git a/clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c b/clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c
index 396614fe5bac2f..2f5459d1bb9c4c 100644
--- a/clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c
+++ b/clang/test/CodeGen/PowerPC/ppc64le-varargs-f128.c
@@ -31,7 +31,7 @@ void foo_ls(ldbl128_s);
 // OMP-TARGET: call void @foo_ld(ppc_fp128 noundef %[[V3]])
 
 // OMP-HOST-LABEL: define{{.*}} void @omp(
-// OMP-HOST: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]])
+// OMP-HOST: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]])
 // OMP-HOST: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]], align 8
 // OMP-HOST: %[[TMP0:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i32 15
 // OMP-HOST: %[[ALIGN:[^ ]+]] = call ptr @llvm.ptrmask.p0.i64(ptr %[[TMP0]], i64 -16)
@@ -49,13 +49,13 @@ void omp(int n, ...) {
 }
 
 // IEEE-LABEL: define{{.*}} void @f128
-// IEEE: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]])
+// IEEE: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]])
 // IEEE: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]]
 // IEEE: %[[TMP0:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i32 15
 // IEEE: %[[ALIGN:[^ ]+]] = call ptr @llvm.ptrmask.p0.i64(ptr %[[TMP0]], i64 -16)
 // IEEE: %[[V4:[0-9a-zA-Z_.]+]] = load fp128, ptr %[[ALIGN]], align 16
 // IEEE: call void @foo_fq(fp128 noundef %[[V4]])
-// IEEE: call void @llvm.va_end(ptr %[[AP]])
+// IEEE: call void @llvm.va_end.p0(ptr %[[AP]])
 void f128(int n, ...) {
   va_list ap;
   va_start(ap, n);
@@ -64,20 +64,20 @@ void f128(int n, ...) {
 }
 
 // IEEE-LABEL: define{{.*}} void @long_double
-// IEEE: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]])
+// IEEE: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]])
 // IEEE: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]]
 // IEEE: %[[TMP0:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i32 15
 // IEEE: %[[ALIGN:[^ ]+]] = call ptr @llvm.ptrmask.p0.i64(ptr %[[TMP0]], i64 -16)
 // IEEE: %[[V4:[0-9a-zA-Z_.]+]] = load fp128, ptr %[[ALIGN]], align 16
 // IEEE: call void @foo_ld(fp128 noundef %[[V4]])
-// IEEE: call void @llvm.va_end(ptr %[[AP]])
+// IEEE: call void @llvm.va_end.p0(ptr %[[AP]])
 
 // IBM-LABEL: define{{.*}} void @long_double
-// IBM: call void @llvm.va_start(ptr  %[[AP:[0-9a-zA-Z_.]+]])
+// IBM: call void @llvm.va_start.p0(ptr  %[[AP:[0-9a-zA-Z_.]+]])
 // IBM: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]]
 // IBM: %[[V4:[0-9a-zA-Z_.]+]] = load ppc_fp128, ptr %[[CUR]], align 8
 // IBM: call void @foo_ld(ppc_fp128 noundef %[[V4]])
-// IBM: call void @llvm.va_end(ptr %[[AP]])
+// IBM: call void @llvm.va_end.p0(ptr %[[AP]])
 void long_double(int n, ...) {
   va_list ap;
   va_start(ap, n);
@@ -86,7 +86,7 @@ void long_double(int n, ...) {
 }
 
 // IEEE-LABEL: define{{.*}} void @long_double_struct
-// IEEE: call void @llvm.va_start(ptr %[[AP:[0-9a-zA-Z_.]+]])
+// IEEE: call void @llvm.va_start.p0(ptr %[[AP:[0-9a-zA-Z_.]+]])
 // IEEE: %[[CUR:[0-9a-zA-Z_.]+]] = load ptr, ptr %[[AP]]
 // IEEE: %[[TMP0:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i32 15
 // IEEE: %[[ALIGN:[^ ]+]] = call ptr @llvm.ptrmask.p0.i64(ptr %[[TMP0]], i64 -16)
@@ -96,7 +96,7 @@ void long_double(int n, ...) {
 // IEEE: %[[COERCE:[0-9a-zA-Z_.]+]] = getelementptr inbounds %struct.ldbl128_s, ptr %[[TMP]], i32 0, i32 0
 // IEEE: %[[V4:[0-9a-zA-Z_.]+]] = load fp128, ptr %[[COERCE]], align 16
 // IEEE: call void @foo_ls(fp128 inreg %[[V4]])
-// IEEE: call void @llvm.va_end(ptr %[[AP]])
+// IEEE: call void @llvm.va_end.p0(ptr %[[AP]])
 void long_double_struct(int n, ...) {
   va_list ap;
   va_start(ap, n);
diff --git a/clang/test/CodeGen/RISCV/riscv32-vararg.c b/clang/test/CodeGen/RISCV/riscv32-vararg.c
index 1c4e41f2f54c8f..00e04eb894675e 100644
--- a/clang/test/CodeGen/RISCV/riscv32-vararg.c
+++ b/clang/test/CodeGen/RISCV/riscv32-vararg.c
@@ -80,13 +80,13 @@ void f_va_caller(void) {
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-NEXT:    [[V:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 4
 // CHECK-NEXT:    store i32 [[TMP0]], ptr [[V]], align 4
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[V]], align 4
 // CHECK-NEXT:    ret i32 [[TMP1]]
 //
@@ -111,7 +111,7 @@ int f_va_1(char *fmt, ...) {
 // CHECK-ILP32F-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-ILP32F-NEXT:    [[V:%.*]] = alloca double, align 8
 // CHECK-ILP32F-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32F-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32F-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32F-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32F-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7
 // CHECK-ILP32F-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8)
@@ -119,7 +119,7 @@ int f_va_1(char *fmt, ...) {
 // CHECK-ILP32F-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK-ILP32F-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARGP_CUR_ALIGNED]], align 8
 // CHECK-ILP32F-NEXT:    store double [[TMP1]], ptr [[V]], align 8
-// CHECK-ILP32F-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32F-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32F-NEXT:    [[TMP2:%.*]] = load double, ptr [[V]], align 8
 // CHECK-ILP32F-NEXT:    ret double [[TMP2]]
 //
@@ -130,7 +130,7 @@ int f_va_1(char *fmt, ...) {
 // CHECK-ILP32D-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-ILP32D-NEXT:    [[V:%.*]] = alloca double, align 8
 // CHECK-ILP32D-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32D-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32D-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32D-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32D-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7
 // CHECK-ILP32D-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8)
@@ -138,7 +138,7 @@ int f_va_1(char *fmt, ...) {
 // CHECK-ILP32D-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK-ILP32D-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARGP_CUR_ALIGNED]], align 8
 // CHECK-ILP32D-NEXT:    store double [[TMP1]], ptr [[V]], align 8
-// CHECK-ILP32D-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32D-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32D-NEXT:    [[TMP2:%.*]] = load double, ptr [[V]], align 8
 // CHECK-ILP32D-NEXT:    ret double [[TMP2]]
 //
@@ -149,13 +149,13 @@ int f_va_1(char *fmt, ...) {
 // CHECK-ILP32E-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-ILP32E-NEXT:    [[V:%.*]] = alloca double, align 8
 // CHECK-ILP32E-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32E-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32E-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32E-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32E-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 8
 // CHECK-ILP32E-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK-ILP32E-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARGP_CUR]], align 4
 // CHECK-ILP32E-NEXT:    store double [[TMP0]], ptr [[V]], align 8
-// CHECK-ILP32E-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32E-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32E-NEXT:    [[TMP1:%.*]] = load double, ptr [[V]], align 8
 // CHECK-ILP32E-NEXT:    ret double [[TMP1]]
 //
@@ -180,7 +180,7 @@ double f_va_2(char *fmt, ...) {
 // CHECK-ILP32F-NEXT:    [[W:%.*]] = alloca i32, align 4
 // CHECK-ILP32F-NEXT:    [[X:%.*]] = alloca double, align 8
 // CHECK-ILP32F-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32F-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32F-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32F-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32F-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7
 // CHECK-ILP32F-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8)
@@ -200,7 +200,7 @@ double f_va_2(char *fmt, ...) {
 // CHECK-ILP32F-NEXT:    store ptr [[ARGP_NEXT4]], ptr [[VA]], align 4
 // CHECK-ILP32F-NEXT:    [[TMP4:%.*]] = load double, ptr [[ARGP_CUR3_ALIGNED]], align 8
 // CHECK-ILP32F-NEXT:    store double [[TMP4]], ptr [[X]], align 8
-// CHECK-ILP32F-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32F-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32F-NEXT:    [[TMP5:%.*]] = load double, ptr [[V]], align 8
 // CHECK-ILP32F-NEXT:    [[TMP6:%.*]] = load double, ptr [[X]], align 8
 // CHECK-ILP32F-NEXT:    [[ADD:%.*]] = fadd double [[TMP5]], [[TMP6]]
@@ -215,7 +215,7 @@ double f_va_2(char *fmt, ...) {
 // CHECK-ILP32D-NEXT:    [[W:%.*]] = alloca i32, align 4
 // CHECK-ILP32D-NEXT:    [[X:%.*]] = alloca double, align 8
 // CHECK-ILP32D-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32D-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32D-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32D-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32D-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7
 // CHECK-ILP32D-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8)
@@ -235,7 +235,7 @@ double f_va_2(char *fmt, ...) {
 // CHECK-ILP32D-NEXT:    store ptr [[ARGP_NEXT4]], ptr [[VA]], align 4
 // CHECK-ILP32D-NEXT:    [[TMP4:%.*]] = load double, ptr [[ARGP_CUR3_ALIGNED]], align 8
 // CHECK-ILP32D-NEXT:    store double [[TMP4]], ptr [[X]], align 8
-// CHECK-ILP32D-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32D-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32D-NEXT:    [[TMP5:%.*]] = load double, ptr [[V]], align 8
 // CHECK-ILP32D-NEXT:    [[TMP6:%.*]] = load double, ptr [[X]], align 8
 // CHECK-ILP32D-NEXT:    [[ADD:%.*]] = fadd double [[TMP5]], [[TMP6]]
@@ -250,7 +250,7 @@ double f_va_2(char *fmt, ...) {
 // CHECK-ILP32E-NEXT:    [[W:%.*]] = alloca i32, align 4
 // CHECK-ILP32E-NEXT:    [[X:%.*]] = alloca double, align 8
 // CHECK-ILP32E-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32E-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32E-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32E-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32E-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 8
 // CHECK-ILP32E-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
@@ -266,7 +266,7 @@ double f_va_2(char *fmt, ...) {
 // CHECK-ILP32E-NEXT:    store ptr [[ARGP_NEXT4]], ptr [[VA]], align 4
 // CHECK-ILP32E-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARGP_CUR3]], align 4
 // CHECK-ILP32E-NEXT:    store double [[TMP2]], ptr [[X]], align 8
-// CHECK-ILP32E-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32E-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32E-NEXT:    [[TMP3:%.*]] = load double, ptr [[V]], align 8
 // CHECK-ILP32E-NEXT:    [[TMP4:%.*]] = load double, ptr [[X]], align 8
 // CHECK-ILP32E-NEXT:    [[ADD:%.*]] = fadd double [[TMP3]], [[TMP4]]
@@ -296,7 +296,7 @@ double f_va_3(char *fmt, ...) {
 // CHECK-ILP32F-NEXT:    [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4
 // CHECK-ILP32F-NEXT:    [[RET:%.*]] = alloca i32, align 4
 // CHECK-ILP32F-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32F-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32F-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32F-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32F-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4
 // CHECK-ILP32F-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
@@ -321,7 +321,7 @@ double f_va_3(char *fmt, ...) {
 // CHECK-ILP32F-NEXT:    store ptr [[ARGP_NEXT8]], ptr [[VA]], align 4
 // CHECK-ILP32F-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARGP_CUR7]], align 4
 // CHECK-ILP32F-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[LS]], ptr align 4 [[TMP3]], i32 16, i1 false)
-// CHECK-ILP32F-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32F-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[V]], align 4
 // CHECK-ILP32F-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP4]] to fp128
 // CHECK-ILP32F-NEXT:    [[TMP5:%.*]] = load fp128, ptr [[LD]], align 16
@@ -384,7 +384,7 @@ double f_va_3(char *fmt, ...) {
 // CHECK-ILP32D-NEXT:    [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4
 // CHECK-ILP32D-NEXT:    [[RET:%.*]] = alloca i32, align 4
 // CHECK-ILP32D-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32D-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32D-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32D-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32D-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4
 // CHECK-ILP32D-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
@@ -409,7 +409,7 @@ double f_va_3(char *fmt, ...) {
 // CHECK-ILP32D-NEXT:    store ptr [[ARGP_NEXT8]], ptr [[VA]], align 4
 // CHECK-ILP32D-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARGP_CUR7]], align 4
 // CHECK-ILP32D-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[LS]], ptr align 4 [[TMP3]], i32 16, i1 false)
-// CHECK-ILP32D-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32D-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32D-NEXT:    [[TMP4:%.*]] = load i32, ptr [[V]], align 4
 // CHECK-ILP32D-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP4]] to fp128
 // CHECK-ILP32D-NEXT:    [[TMP5:%.*]] = load fp128, ptr [[LD]], align 16
@@ -472,7 +472,7 @@ double f_va_3(char *fmt, ...) {
 // CHECK-ILP32E-NEXT:    [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 4
 // CHECK-ILP32E-NEXT:    [[RET:%.*]] = alloca i32, align 4
 // CHECK-ILP32E-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-ILP32E-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-ILP32E-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-ILP32E-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-ILP32E-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4
 // CHECK-ILP32E-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
@@ -497,7 +497,7 @@ double f_va_3(char *fmt, ...) {
 // CHECK-ILP32E-NEXT:    store ptr [[ARGP_NEXT8]], ptr [[VA]], align 4
 // CHECK-ILP32E-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ARGP_CUR7]], align 4
 // CHECK-ILP32E-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[LS]], ptr align 4 [[TMP3]], i32 16, i1 false)
-// CHECK-ILP32E-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-ILP32E-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-ILP32E-NEXT:    [[TMP4:%.*]] = load i32, ptr [[V]], align 4
 // CHECK-ILP32E-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP4]] to fp128
 // CHECK-ILP32E-NEXT:    [[TMP5:%.*]] = load fp128, ptr [[LD]], align 16
diff --git a/clang/test/CodeGen/RISCV/riscv64-vararg.c b/clang/test/CodeGen/RISCV/riscv64-vararg.c
index 634cde61320cb6..efdffa2687e624 100644
--- a/clang/test/CodeGen/RISCV/riscv64-vararg.c
+++ b/clang/test/CodeGen/RISCV/riscv64-vararg.c
@@ -135,13 +135,13 @@ void f_va_caller(void) {
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[V:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i64 8
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 8
 // CHECK-NEXT:    store i32 [[TMP0]], ptr [[V]], align 4
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[V]], align 4
 // CHECK-NEXT:    ret i32 [[TMP1]]
 //
@@ -166,7 +166,7 @@ int f_va_1(char *fmt, ...) {
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[V:%.*]] = alloca fp128, align 16
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 15
 // CHECK-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -16)
@@ -174,7 +174,7 @@ int f_va_1(char *fmt, ...) {
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load fp128, ptr [[ARGP_CUR_ALIGNED]], align 16
 // CHECK-NEXT:    store fp128 [[TMP1]], ptr [[V]], align 16
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP2:%.*]] = load fp128, ptr [[V]], align 16
 // CHECK-NEXT:    ret fp128 [[TMP2]]
 //
@@ -199,7 +199,7 @@ long double f_va_2(char *fmt, ...) {
 // CHECK-NEXT:    [[W:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    [[X:%.*]] = alloca fp128, align 16
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 15
 // CHECK-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[TMP0]], i64 -16)
@@ -219,7 +219,7 @@ long double f_va_2(char *fmt, ...) {
 // CHECK-NEXT:    store ptr [[ARGP_NEXT4]], ptr [[VA]], align 8
 // CHECK-NEXT:    [[TMP4:%.*]] = load fp128, ptr [[ARGP_CUR3_ALIGNED]], align 16
 // CHECK-NEXT:    store fp128 [[TMP4]], ptr [[X]], align 16
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP5:%.*]] = load fp128, ptr [[V]], align 16
 // CHECK-NEXT:    [[TMP6:%.*]] = load fp128, ptr [[X]], align 16
 // CHECK-NEXT:    [[ADD:%.*]] = fadd fp128 [[TMP5]], [[TMP6]]
@@ -248,7 +248,7 @@ long double f_va_3(char *fmt, ...) {
 // CHECK-NEXT:    [[LS:%.*]] = alloca [[STRUCT_LARGE:%.*]], align 8
 // CHECK-NEXT:    [[RET:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 8
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i64 8
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 8
@@ -267,7 +267,7 @@ long double f_va_3(char *fmt, ...) {
 // CHECK-NEXT:    store ptr [[ARGP_NEXT6]], ptr [[VA]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARGP_CUR5]], align 8
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[LS]], ptr align 8 [[TMP1]], i64 32, i1 false)
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_TINY]], ptr [[TS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[A]], align 2
 // CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i64
diff --git a/clang/test/CodeGen/WebAssembly/wasm-varargs.c b/clang/test/CodeGen/WebAssembly/wasm-varargs.c
index c475de19ae4487..e794857304e1c9 100644
--- a/clang/test/CodeGen/WebAssembly/wasm-varargs.c
+++ b/clang/test/CodeGen/WebAssembly/wasm-varargs.c
@@ -10,13 +10,13 @@
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-NEXT:    [[V:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 4
 // CHECK-NEXT:    store i32 [[TMP0]], ptr [[V]], align 4
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[V]], align 4
 // CHECK-NEXT:    ret i32 [[TMP1]]
 //
@@ -38,7 +38,7 @@ int test_i32(char *fmt, ...) {
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-NEXT:    [[V:%.*]] = alloca i64, align 8
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7
 // CHECK-NEXT:    [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8)
@@ -46,7 +46,7 @@ int test_i32(char *fmt, ...) {
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[ARGP_CUR_ALIGNED]], align 8
 // CHECK-NEXT:    store i64 [[TMP1]], ptr [[V]], align 8
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[V]], align 8
 // CHECK-NEXT:    ret i64 [[TMP2]]
 //
@@ -73,13 +73,13 @@ struct S {
 // CHECK-NEXT:    [[FMT_ADDR:%.*]] = alloca ptr, align 4
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGP_CUR]], align 4
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_RESULT]], ptr align 4 [[TMP0]], i32 12, i1 false)
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    ret void
 //
 struct S test_struct(char *fmt, ...) {
@@ -102,7 +102,7 @@ struct Z {};
 // CHECK-NEXT:    [[VA:%.*]] = alloca ptr, align 4
 // CHECK-NEXT:    [[U:%.*]] = alloca [[STRUCT_Z:%.*]], align 1
 // CHECK-NEXT:    store ptr [[FMT]], ptr [[FMT_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]])
 // CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[VA]], align 4
 // CHECK-NEXT:    [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 0
 // CHECK-NEXT:    store ptr [[ARGP_NEXT]], ptr [[VA]], align 4
@@ -112,7 +112,7 @@ struct Z {};
 // CHECK-NEXT:    store ptr [[ARGP_NEXT2]], ptr [[VA]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARGP_CUR1]], align 4
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_RESULT]], ptr align 4 [[TMP0]], i32 12, i1 false)
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[VA]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VA]])
 // CHECK-NEXT:    ret void
 //
 struct S test_empty_struct(char *fmt, ...) {
diff --git a/clang/test/CodeGen/X86/va-arg-sse.c b/clang/test/CodeGen/X86/va-arg-sse.c
index e040b0e5790bd1..b7d00dad1453d3 100644
--- a/clang/test/CodeGen/X86/va-arg-sse.c
+++ b/clang/test/CodeGen/X86/va-arg-sse.c
@@ -21,7 +21,7 @@ struct S a[5];
 // CHECK-NEXT:    store i32 0, ptr [[J]], align 4
 // CHECK-NEXT:    store i32 0, ptr [[K]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[AP]], i64 0, i64 0
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
 // CHECK-NEXT:    store ptr getelementptr inbounds ([5 x %struct.S], ptr @a, i64 0, i64 2), ptr [[P]], align 8
 // CHECK-NEXT:    [[ARRAYDECAY2:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[AP]], i64 0, i64 0
 // CHECK-NEXT:    [[FP_OFFSET_P:%.*]] = getelementptr inbounds [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY2]], i32 0, i32 1
@@ -52,7 +52,7 @@ struct S a[5];
 // CHECK-NEXT:    [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ]
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARG]], ptr align 4 [[VAARG_ADDR]], i64 12, i1 false)
 // CHECK-NEXT:    [[ARRAYDECAY3:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[AP]], i64 0, i64 0
-// CHECK-NEXT:    call void @llvm.va_end(ptr [[ARRAYDECAY3]])
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[ARRAYDECAY3]])
 // CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[P]], align 8
 // CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne ptr [[TMP15]], null
 // CHECK-NEXT:    br i1 [[TOBOOL]], label [[LAND_LHS_TRUE:%.*]], label [[IF_END:%.*]]
diff --git a/clang/test/CodeGen/X86/x86_64-vaarg.c b/clang/test/CodeGen/X86/x86_64-vaarg.c
index a18ba836423881..07c6df14a0b812 100644
--- a/clang/test/CodeGen/X86/x86_64-vaarg.c
+++ b/clang/test/CodeGen/X86/x86_64-vaarg.c
@@ -13,7 +13,7 @@ typedef struct { struct {} a; } empty;
 // CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_EMPTY]], align 1
 // CHECK-NEXT:    store i32 [[Z]], ptr [[Z_ADDR]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
 // CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL]], ptr align 1 [[TMP]], i64 0, i1 false)
 // CHECK-NEXT:    ret void
@@ -37,7 +37,7 @@ typedef struct {
 // CHECK-NEXT:    [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
 // CHECK-NEXT:    store i32 [[Z]], ptr [[Z_ADDR]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
 // CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
 // CHECK-NEXT:    [[FP_OFFSET_P:%.*]] = getelementptr inbounds [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 1
 // CHECK-NEXT:    [[FP_OFFSET:%.*]] = load i32, ptr [[FP_OFFSET_P]], align 4
diff --git a/clang/test/CodeGen/aarch64-ABI-align-packed.c b/clang/test/CodeGen/aarch64-ABI-align-packed.c
index 2b029f64589567..13c68fe54b849f 100644
--- a/clang/test/CodeGen/aarch64-ABI-align-packed.c
+++ b/clang/test/CodeGen/aarch64-ABI-align-packed.c
@@ -73,7 +73,7 @@ __attribute__((noinline)) void named_arg_non_packed_struct(double d0, double d1,
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8
 // CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6:[0-9]+]]
-// CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[VL]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[VL]])
 // CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 void variadic_non_packed_struct(double d0, double d1, double d2, double d3,
@@ -128,7 +128,7 @@ __attribute__((noinline)) void named_arg_packed_struct(double d0, double d1, dou
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8
 // CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
-// CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[VL]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[VL]])
 // CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 void variadic_packed_struct(double d0, double d1, double d2, double d3,
@@ -183,7 +183,7 @@ __attribute__((noinline)) void named_arg_packed_member(double d0, double d1, dou
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8
 // CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
-// CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[VL]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[VL]])
 // CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 void variadic_packed_member(double d0, double d1, double d2, double d3,
@@ -238,7 +238,7 @@ __attribute__((noinline)) void named_arg_aligned_struct_8(double d0, double d1,
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8
 // CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
-// CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[VL]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[VL]])
 // CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 void variadic_aligned_struct_8(double d0, double d1, double d2, double d3,
@@ -293,7 +293,7 @@ __attribute__((noinline)) void named_arg_aligned_member_8(double d0, double d1,
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8
 // CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
-// CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[VL]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[VL]])
 // CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 void variadic_aligned_member_8(double d0, double d1, double d2, double d3,
@@ -348,7 +348,7 @@ __attribute__((noinline)) void named_arg_pragma_packed_struct_8(double d0, doubl
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8
 // CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
-// CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[VL]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[VL]])
 // CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 void variadic_pragma_packed_struct_8(double d0, double d1, double d2, double d3,
@@ -403,7 +403,7 @@ __attribute__((noinline)) void named_arg_pragma_packed_struct_4(double d0, doubl
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[VL:%.*]] = alloca [[STRUCT___VA_LIST:%.*]], align 8
 // CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
-// CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[VL]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[VL]])
 // CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VL]]) #[[ATTR6]]
 // CHECK-NEXT:    ret void
 void variadic_pragma_packed_struct_4(double d0, double d1, double d2, double d3,
diff --git a/clang/test/CodeGen/aarch64-varargs.c b/clang/test/CodeGen/aarch64-varargs.c
index 44b87029e7b3d3..ee4e88eda4ef44 100644
--- a/clang/test/CodeGen/aarch64-varargs.c
+++ b/clang/test/CodeGen/aarch64-varargs.c
@@ -837,7 +837,7 @@ void check_start(int n, ...) {
   va_list the_list;
   va_start(the_list, n);
 // CHECK: [[THE_LIST:%[a-z_0-9]+]] = alloca %struct.__va_list
-// CHECK: call void @llvm.va_start(ptr [[THE_LIST]])
+// CHECK: call void @llvm.va_start.p0(ptr [[THE_LIST]])
 }
 
 typedef struct {} empty;
diff --git a/clang/test/CodeGen/arm-varargs.c b/clang/test/CodeGen/arm-varargs.c
index f754c7f52e5904..ab4ac46924e605 100644
--- a/clang/test/CodeGen/arm-varargs.c
+++ b/clang/test/CodeGen/arm-varargs.c
@@ -264,5 +264,5 @@ void check_start(int n, ...) {
   va_list the_list;
   va_start(the_list, n);
 // CHECK: [[THE_LIST:%[a-z0-9._]+]] = alloca %struct.__va_list
-// CHECK: call void @llvm.va_start(ptr [[THE_LIST]])
+// CHECK: call void @llvm.va_start.p0(ptr [[THE_LIST]])
 }
diff --git a/clang/test/CodeGen/hexagon-linux-vararg.c b/clang/test/CodeGen/hexagon-linux-vararg.c
index 033e72ab449d31..84945e872d28bc 100644
--- a/clang/test/CodeGen/hexagon-linux-vararg.c
+++ b/clang/test/CodeGen/hexagon-linux-vararg.c
@@ -9,7 +9,7 @@ struct AAA {
   int d;
 };
 
-// CHECK:   call void @llvm.va_start(ptr %arraydecay)
+// CHECK:   call void @llvm.va_start.p0(ptr %arraydecay)
 // CHECK:   %arraydecay1 = getelementptr inbounds [1 x %struct.__va_list_tag],
 // ptr %ap, i32 0, i32 0
 // CHECK:   br label %vaarg.maybe_reg
diff --git a/clang/test/CodeGen/mips-varargs.c b/clang/test/CodeGen/mips-varargs.c
index 052aedd1cd1e2c..029f000c121a5b 100644
--- a/clang/test/CodeGen/mips-varargs.c
+++ b/clang/test/CodeGen/mips-varargs.c
@@ -29,7 +29,7 @@ int test_i32(char *fmt, ...) {
 // ALL:   [[V:%.*]] = alloca i32, align 4
 // NEW:   [[PROMOTION_TEMP:%.*]] = alloca i32, align 4
 //
-// ALL:   call void @llvm.va_start(ptr %va)
+// ALL:   call void @llvm.va_start.p0(ptr %va)
 // ALL:   [[AP_CUR:%.+]] = load ptr, ptr %va, align [[$PTRALIGN]]
 // O32:   [[AP_NEXT:%.+]] = getelementptr inbounds i8, ptr [[AP_CUR]], [[$INTPTR_T:i32]] [[$CHUNKSIZE:4]]
 // NEW:   [[AP_NEXT:%.+]] = getelementptr inbounds i8, ptr [[AP_CUR]], [[$INTPTR_T:i32|i64]] [[$CHUNKSIZE:8]]
@@ -45,7 +45,7 @@ int test_i32(char *fmt, ...) {
 // NEW:   [[ARG:%.+]] = load i32, ptr [[PROMOTION_TEMP]], align 4
 // ALL:   store i32 [[ARG]], ptr [[V]], align 4
 //
-// ALL:   call void @llvm.va_end(ptr %va)
+// ALL:   call void @llvm.va_end.p0(ptr %va)
 // ALL: }
 
 long long test_i64(char *fmt, ...) {
@@ -61,7 +61,7 @@ long long test_i64(char *fmt, ...) {
 // ALL-LABEL: define{{.*}} i64 @test_i64(ptr{{.*}} %fmt, ...)
 //
 // ALL:   %va = alloca ptr, align [[$PTRALIGN]]
-// ALL:   call void @llvm.va_start(ptr %va)
+// ALL:   call void @llvm.va_start.p0(ptr %va)
 // ALL:   [[AP_CUR:%.+]] = load ptr, ptr %va, align [[$PTRALIGN]]
 //
 // i64 is 8-byte aligned, while this is within O32's stack alignment there's no
@@ -74,7 +74,7 @@ long long test_i64(char *fmt, ...) {
 //
 // ALL:   [[ARG:%.+]] = load i64, ptr [[AP_CUR]], align 8
 //
-// ALL:   call void @llvm.va_end(ptr %va)
+// ALL:   call void @llvm.va_end.p0(ptr %va)
 // ALL: }
 
 char *test_ptr(char *fmt, ...) {
@@ -92,7 +92,7 @@ char *test_ptr(char *fmt, ...) {
 // ALL:   %va = alloca ptr, align [[$PTRALIGN]]
 // ALL:   [[V:%.*]] = alloca ptr, align [[$PTRALIGN]]
 // N32:   [[AP_CAST:%.+]] = alloca ptr, align 4
-// ALL:   call void @llvm.va_start(ptr %va)
+// ALL:   call void @llvm.va_start.p0(ptr %va)
 // ALL:   [[AP_CUR:%.+]] = load ptr, ptr %va, align [[$PTRALIGN]]
 // ALL:   [[AP_NEXT:%.+]] = getelementptr inbounds i8, ptr [[AP_CUR]], [[$INTPTR_T]] [[$CHUNKSIZE]]
 // ALL:   store ptr [[AP_NEXT]], ptr %va, align [[$PTRALIGN]]
@@ -109,7 +109,7 @@ char *test_ptr(char *fmt, ...) {
 // N64:   [[ARG:%.+]] = load ptr, ptr [[AP_CUR]], align [[$PTRALIGN]]
 // ALL:   store ptr [[ARG]], ptr [[V]], align [[$PTRALIGN]]
 //
-// ALL:   call void @llvm.va_end(ptr %va)
+// ALL:   call void @llvm.va_end.p0(ptr %va)
 // ALL: }
 
 int test_v4i32(char *fmt, ...) {
@@ -128,7 +128,7 @@ int test_v4i32(char *fmt, ...) {
 //
 // ALL:   %va = alloca ptr, align [[$PTRALIGN]]
 // ALL:   [[V:%.+]] = alloca <4 x i32>, align 16
-// ALL:   call void @llvm.va_start(ptr %va)
+// ALL:   call void @llvm.va_start.p0(ptr %va)
 // ALL:   [[AP_CUR:%.+]] = load ptr, ptr %va, align [[$PTRALIGN]]
 //
 // Vectors are 16-byte aligned, however the O32 ABI has a maximum alignment of
@@ -152,7 +152,7 @@ int test_v4i32(char *fmt, ...) {
 // N32:   [[ARG:%.+]] = load <4 x i32>, ptr [[AP_CUR]], align 16
 // ALL:   store <4 x i32> [[ARG]], ptr [[V]], align 16
 //
-// ALL:   call void @llvm.va_end(ptr %va)
+// ALL:   call void @llvm.va_end.p0(ptr %va)
 // ALL:   [[VECEXT:%.+]] = extractelement <4 x i32> {{.*}}, i32 0
 // ALL:   ret i32 [[VECEXT]]
 // ALL: }
diff --git a/clang/test/CodeGen/pr53127.cpp b/clang/test/CodeGen/pr53127.cpp
index 97fe1291352d3c..5a52b4860eecdd 100644
--- a/clang/test/CodeGen/pr53127.cpp
+++ b/clang/test/CodeGen/pr53127.cpp
@@ -34,7 +34,7 @@ void operator delete(void*);
 // CHECK-NEXT:    br i1 [[CALL6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
 // CHECK:       cond.true7:
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[L]], i64 0, i64 0
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
 // CHECK-NEXT:    br label [[COND_END9:%.*]]
 // CHECK:       cond.false8:
 // CHECK-NEXT:    br label [[COND_END9]]
@@ -44,7 +44,7 @@ void operator delete(void*);
 // CHECK:       cond.true11:
 // CHECK-NEXT:    [[ARRAYDECAY12:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[L]], i64 0, i64 0
 // CHECK-NEXT:    [[ARRAYDECAY13:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[L2]], i64 0, i64 0
-// CHECK-NEXT:    call void @llvm.va_copy(ptr [[ARRAYDECAY12]], ptr [[ARRAYDECAY13]])
+// CHECK-NEXT:    call void @llvm.va_copy.p0(ptr [[ARRAYDECAY12]], ptr [[ARRAYDECAY13]])
 // CHECK-NEXT:    br label [[COND_END15:%.*]]
 // CHECK:       cond.false14:
 // CHECK-NEXT:    br label [[COND_END15]]
diff --git a/clang/test/CodeGen/varargs-with-nonzero-default-address-space.c b/clang/test/CodeGen/varargs-with-nonzero-default-address-space.c
new file mode 100644
index 00000000000000..b087da34c3dfb5
--- /dev/null
+++ b/clang/test/CodeGen/varargs-with-nonzero-default-address-space.c
@@ -0,0 +1,46 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple spirv64-unknown-unknown -fcuda-is-device -emit-llvm -o - %s | FileCheck %s
+
+struct x {
+  double b;
+  long a;
+};
+
+// CHECK-LABEL: define spir_func void @testva(
+// CHECK-SAME: i32 noundef [[N:%.*]], ...) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AP:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT:    [[T:%.*]] = alloca [[STRUCT_X:%.*]], align 8
+// CHECK-NEXT:    [[AP2:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT:    [[V:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[VARET:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr [[N_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT:    [[AP_ASCAST:%.*]] = addrspacecast ptr [[AP]] to ptr addrspace(4)
+// CHECK-NEXT:    [[T_ASCAST:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4)
+// CHECK-NEXT:    [[AP2_ASCAST:%.*]] = addrspacecast ptr [[AP2]] to ptr addrspace(4)
+// CHECK-NEXT:    [[V_ASCAST:%.*]] = addrspacecast ptr [[V]] to ptr addrspace(4)
+// CHECK-NEXT:    [[VARET_ASCAST:%.*]] = addrspacecast ptr [[VARET]] to ptr addrspace(4)
+// CHECK-NEXT:    store i32 [[N]], ptr addrspace(4) [[N_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    call void @llvm.va_start.p4(ptr addrspace(4) [[AP_ASCAST]])
+// CHECK-NEXT:    [[TMP0:%.*]] = va_arg ptr addrspace(4) [[AP_ASCAST]], ptr
+// CHECK-NEXT:    call void @llvm.memcpy.p4.p0.i64(ptr addrspace(4) align 8 [[T_ASCAST]], ptr align 8 [[TMP0]], i64 16, i1 false)
+// CHECK-NEXT:    call void @llvm.va_copy.p4(ptr addrspace(4) [[AP2_ASCAST]], ptr addrspace(4) [[AP_ASCAST]])
+// CHECK-NEXT:    [[TMP1:%.*]] = va_arg ptr addrspace(4) [[AP2_ASCAST]], i32
+// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(4) [[VARET_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[VARET_ASCAST]], align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(4) [[V_ASCAST]], align 4
+// CHECK-NEXT:    call void @llvm.va_end.p4(ptr addrspace(4) [[AP2_ASCAST]])
+// CHECK-NEXT:    call void @llvm.va_end.p4(ptr addrspace(4) [[AP_ASCAST]])
+// CHECK-NEXT:    ret void
+
+void testva(int n, ...) {
+  __builtin_va_list ap;
+  __builtin_va_start(ap, n);
+  struct x t = __builtin_va_arg(ap, struct x);
+  __builtin_va_list ap2;
+  __builtin_va_copy(ap2, ap);
+  int v = __builtin_va_arg(ap2, int);
+  __builtin_va_end(ap2);
+  __builtin_va_end(ap);
+}
diff --git a/clang/test/CodeGen/xcore-abi.c b/clang/test/CodeGen/xcore-abi.c
index 4dd0f221533b94..bb8d2fec46bdb2 100644
--- a/clang/test/CodeGen/xcore-abi.c
+++ b/clang/test/CodeGen/xcore-abi.c
@@ -28,7 +28,7 @@ void testva (int n, ...) {
   // CHECK: [[AP:%[a-z0-9]+]] = alloca ptr, align 4
   // CHECK: [[V5:%[a-z0-9]+]] = alloca %struct.x, align 4
   // CHECK: [[TMP:%[a-z0-9]+]] = alloca [4 x i32], align 4
-  // CHECK: call void @llvm.va_start(ptr [[AP]])
+  // CHECK: call void @llvm.va_start.p0(ptr [[AP]])
 
   char* v1 = va_arg (ap, char*);
   f(v1);
diff --git a/clang/test/CodeGenCXX/ext-int.cpp b/clang/test/CodeGenCXX/ext-int.cpp
index 5a4270aef28542..a1d17c840ee460 100644
--- a/clang/test/CodeGenCXX/ext-int.cpp
+++ b/clang/test/CodeGenCXX/ext-int.cpp
@@ -159,9 +159,9 @@ void TakesVarargs(int i, ...) {
   // WIN: %[[ARGS:.+]] = alloca ptr
   __builtin_va_start(args, i);
   // LIN64: %[[STARTAD:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr %[[ARGS]]
-  // LIN64: call void @llvm.va_start(ptr %[[STARTAD]])
-  // LIN32: call void @llvm.va_start(ptr %[[ARGS]])
-  // WIN: call void @llvm.va_start(ptr %[[ARGS]])
+  // LIN64: call void @llvm.va_start.p0(ptr %[[STARTAD]])
+  // LIN32: call void @llvm.va_start.p0(ptr %[[ARGS]])
+  // WIN: call void @llvm.va_start.p0(ptr %[[ARGS]])
 
   _BitInt(92) A = __builtin_va_arg(args, _BitInt(92));
   // LIN64: %[[AD1:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr %[[ARGS]]
@@ -302,9 +302,9 @@ void TakesVarargs(int i, ...) {
 
   __builtin_va_end(args);
   // LIN64: %[[ENDAD:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr %[[ARGS]]
-  // LIN64: call void @llvm.va_end(ptr %[[ENDAD]])
-  // LIN32: call void @llvm.va_end(ptr %[[ARGS]])
-  // WIN: call void @llvm.va_end(ptr %[[ARGS]])
+  // LIN64: call void @llvm.va_end.p0(ptr %[[ENDAD]])
+  // LIN32: call void @llvm.va_end.p0(ptr %[[ARGS]])
+  // WIN: call void @llvm.va_end.p0(ptr %[[ARGS]])
 }
 void typeid_tests() {
   // LIN: define{{.*}} void @_Z12typeid_testsv()
diff --git a/clang/test/CodeGenCXX/ibm128-declarations.cpp b/clang/test/CodeGenCXX/ibm128-declarations.cpp
index 5ee4f354d37957..e0187e20cde423 100644
--- a/clang/test/CodeGenCXX/ibm128-declarations.cpp
+++ b/clang/test/CodeGenCXX/ibm128-declarations.cpp
@@ -107,13 +107,13 @@ int main(void) {
 // CHECK: define dso_local noundef ppc_fp128 @_Z10func_vaargiz(i32 noundef signext %n, ...)
 // CHECK: entry:
 // CHECK:   store i32 %n, ptr %n.addr, align 4
-// CHECK:   call void @llvm.va_start(ptr %ap)
+// CHECK:   call void @llvm.va_start.p0(ptr %ap)
 // CHECK:   %argp.cur = load ptr, ptr %ap, align 8
 // CHECK:   %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 16
 // CHECK:   store ptr %argp.next, ptr %ap, align 8
 // CHECK:   %0 = load ppc_fp128, ptr %argp.cur, align 8
 // CHECK:   store ppc_fp128 %0, ptr %r, align 16
-// CHECK:   call void @llvm.va_end(ptr %ap)
+// CHECK:   call void @llvm.va_end.p0(ptr %ap)
 // CHECK:   %1 = load ppc_fp128, ptr %r, align 16
 // CHECK:   ret ppc_fp128 %1
 // CHECK: }
diff --git a/clang/test/CodeGenCXX/x86_64-vaarg.cpp b/clang/test/CodeGenCXX/x86_64-vaarg.cpp
index d221c1881d3672..985a0cc41a1410 100644
--- a/clang/test/CodeGenCXX/x86_64-vaarg.cpp
+++ b/clang/test/CodeGenCXX/x86_64-vaarg.cpp
@@ -11,7 +11,7 @@ typedef struct { struct {} a; } empty;
 // CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_EMPTY]], align 1
 // CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
 // CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL]], ptr align 1 [[TMP]], i64 1, i1 false)
 // CHECK-NEXT:    ret void
@@ -34,7 +34,7 @@ typedef struct {
 // CHECK-NEXT:    [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
 // CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
-// CHECK-NEXT:    call void @llvm.va_start(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
 // CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
 // CHECK-NEXT:    [[FP_OFFSET_P:%.*]] = getelementptr inbounds [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 1
 // CHECK-NEXT:    [[FP_OFFSET:%.*]] = load i32, ptr [[FP_OFFSET_P]], align 4
diff --git a/clang/test/Modules/codegen.test b/clang/test/Modules/codegen.test
index 77602056defd4e..0af630a7548056 100644
--- a/clang/test/Modules/codegen.test
+++ b/clang/test/Modules/codegen.test
@@ -26,7 +26,7 @@ USE: $_Z4instIiEvv = comdat any
 USE: $_Z10always_inlv = comdat any
 FOO: $_ZN13implicit_dtorD2Ev = comdat any
 FOO: define weak_odr void @_Z2f1PKcz(ptr noundef %fmt, ...) #{{[0-9]+}} comdat
-FOO:   call void @llvm.va_start(ptr %{{[a-zA-Z0-9]*}})
+FOO:   call void @llvm.va_start.p0(ptr %{{[a-zA-Z0-9]*}})
 
 Test that implicit special members are emitted into the FOO module if they're
 ODR used there, otherwise emit them linkonce_odr as usual in the use.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 030b76935c7b4e..a4be31576931cd 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -12815,10 +12815,11 @@ Variable argument support is defined in LLVM with the
 functions. These functions are related to the similarly named macros
 defined in the ``<stdarg.h>`` header file.
 
-All of these functions operate on arguments that use a target-specific
+All of these functions take as arguments pointers to a target-specific
 value type "``va_list``". The LLVM assembly language reference manual
 does not define what this type is, so all transformations should be
-prepared to handle these functions regardless of the type used.
+prepared to handle these functions regardless of the type used. The intrinsics
+are overloaded, and can be used for pointers to different address spaces.
 
 This example shows how the :ref:`va_arg <i_va_arg>` instruction and the
 variable argument handling intrinsic functions are used.
@@ -12835,24 +12836,24 @@ variable argument handling intrinsic functions are used.
     define i32 @test(i32 %X, ...) {
       ; Initialize variable argument processing
       %ap = alloca %struct.va_list
-      call void @llvm.va_start(ptr %ap)
+      call void @llvm.va_start.p0(ptr %ap)
 
       ; Read a single integer argument
       %tmp = va_arg ptr %ap, i32
 
       ; Demonstrate usage of llvm.va_copy and llvm.va_end
       %aq = alloca ptr
-      call void @llvm.va_copy(ptr %aq, ptr %ap)
-      call void @llvm.va_end(ptr %aq)
+      call void @llvm.va_copy.p0(ptr %aq, ptr %ap)
+      call void @llvm.va_end.p0(ptr %aq)
 
       ; Stop processing of arguments.
-      call void @llvm.va_end(ptr %ap)
+      call void @llvm.va_end.p0(ptr %ap)
       ret i32 %tmp
     }
 
-    declare void @llvm.va_start(ptr)
-    declare void @llvm.va_copy(ptr, ptr)
-    declare void @llvm.va_end(ptr)
+    declare void @llvm.va_start.p0(ptr)
+    declare void @llvm.va_copy.p0(ptr, ptr)
+    declare void @llvm.va_end.p0(ptr)
 
 .. _int_va_start:
 
@@ -12864,7 +12865,8 @@ Syntax:
 
 ::
 
-      declare void @llvm.va_start(ptr <arglist>)
+      declare void @llvm.va_start.p0(ptr <arglist>)
+      declare void @llvm.va_start.p5(ptr addrspace(5) <arglist>)
 
 Overview:
 """""""""
@@ -12896,7 +12898,8 @@ Syntax:
 
 ::
 
-      declare void @llvm.va_end(ptr <arglist>)
+      declare void @llvm.va_end.p0(ptr <arglist>)
+      declare void @llvm.va_end.p5(ptr addrspace(5) <arglist>)
 
 Overview:
 """""""""
@@ -12929,7 +12932,8 @@ Syntax:
 
 ::
 
-      declare void @llvm.va_copy(ptr <destarglist>, ptr <srcarglist>)
+      declare void @llvm.va_copy.p0(ptr <destarglist>, ptr <srcarglist>)
+      declare void @llvm.va_copy.p5(ptr addrspace(5) <destarglist>, ptr addrspace(5) <srcarglist>)
 
 Overview:
 """""""""
@@ -12942,6 +12946,7 @@ Arguments:
 
 The first argument is a pointer to a ``va_list`` element to initialize.
 The second argument is a pointer to a ``va_list`` element to copy from.
+The address spaces of the two arguments must match.
 
 Semantics:
 """"""""""
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index d0ef9c25f39ea8..764902426f0b82 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -700,10 +700,13 @@ class MSBuiltin<string name> {
 //===--------------- Variable Argument Handling Intrinsics ----------------===//
 //
 
-def int_vastart : DefaultAttrsIntrinsic<[], [llvm_ptr_ty], [], "llvm.va_start">;
-def int_vacopy  : DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [],
-                            "llvm.va_copy">;
-def int_vaend   : DefaultAttrsIntrinsic<[], [llvm_ptr_ty], [], "llvm.va_end">;
+def int_vastart : DefaultAttrsIntrinsic<[],
+                                        [llvm_anyptr_ty], [], "llvm.va_start">;
+def int_vacopy  : DefaultAttrsIntrinsic<[],
+                                        [llvm_anyptr_ty, LLVMMatchType<0>], [],
+                                        "llvm.va_copy">;
+def int_vaend   : DefaultAttrsIntrinsic<[],
+                                        [llvm_anyptr_ty], [], "llvm.va_end">;
 
 //===------------------- Garbage Collection Intrinsics --------------------===//
 //
diff --git a/llvm/test/Bitcode/compatibility-3.6.ll b/llvm/test/Bitcode/compatibility-3.6.ll
index b1f4abf7b8c554..2190e2fbccf288 100644
--- a/llvm/test/Bitcode/compatibility-3.6.ll
+++ b/llvm/test/Bitcode/compatibility-3.6.ll
@@ -1061,16 +1061,16 @@ define void @instructions.va_arg(i8* %v, ...) {
   %ap2 = bitcast i8** %ap to i8*
 
   call void @llvm.va_start(i8* %ap2)
-  ; CHECK: call void @llvm.va_start(ptr %ap2)
+  ; CHECK: call void @llvm.va_start.p0(ptr %ap2)
 
   va_arg i8* %ap2, i32
   ; CHECK: va_arg ptr %ap2, i32
 
   call void @llvm.va_copy(i8* %v, i8* %ap2)
-  ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2)
+  ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2)
 
   call void @llvm.va_end(i8* %ap2)
-  ; CHECK: call void @llvm.va_end(ptr %ap2)
+  ; CHECK: call void @llvm.va_end.p0(ptr %ap2)
 
   ret void
 }
@@ -1178,11 +1178,11 @@ define void @intrinsics.codegen() {
 ; CHECK: attributes #27 = { uwtable }
 ; CHECK: attributes #28 = { "cpu"="cortex-a8" }
 ; CHECK: attributes #29 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #30 = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #31 = { nounwind memory(argmem: read) }
-; CHECK: attributes #32 = { nounwind memory(argmem: readwrite) }
-; CHECK: attributes #33 = { nocallback nofree nosync nounwind willreturn memory(read) }
-; CHECK: attributes #34 = { nocallback nounwind }
+; CHECK: attributes #30 = { nounwind memory(argmem: read) }
+; CHECK: attributes #31 = { nounwind memory(argmem: readwrite) }
+; CHECK: attributes #32 = { nocallback nofree nosync nounwind willreturn memory(read) }
+; CHECK: attributes #33 = { nocallback nounwind }
+; CHECK: attributes #34 = { nocallback nofree nosync nounwind willreturn }
 ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #36 = { builtin }
 
diff --git a/llvm/test/Bitcode/compatibility-3.7.ll b/llvm/test/Bitcode/compatibility-3.7.ll
index 91e55f6eda59f9..7e59b5c1be6e2f 100644
--- a/llvm/test/Bitcode/compatibility-3.7.ll
+++ b/llvm/test/Bitcode/compatibility-3.7.ll
@@ -1092,16 +1092,16 @@ define void @instructions.va_arg(i8* %v, ...) {
   %ap2 = bitcast i8** %ap to i8*
 
   call void @llvm.va_start(i8* %ap2)
-  ; CHECK: call void @llvm.va_start(ptr %ap2)
+  ; CHECK: call void @llvm.va_start.p0(ptr %ap2)
 
   va_arg i8* %ap2, i32
   ; CHECK: va_arg ptr %ap2, i32
 
   call void @llvm.va_copy(i8* %v, i8* %ap2)
-  ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2)
+  ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2)
 
   call void @llvm.va_end(i8* %ap2)
-  ; CHECK: call void @llvm.va_end(ptr %ap2)
+  ; CHECK: call void @llvm.va_end.p0(ptr %ap2)
 
   ret void
 }
@@ -1241,11 +1241,11 @@ define void @misc.metadata() {
 ; CHECK: attributes #30 = { uwtable }
 ; CHECK: attributes #31 = { "cpu"="cortex-a8" }
 ; CHECK: attributes #32 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #33 = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #34 = { nounwind memory(argmem: read) }
-; CHECK: attributes #35 = { nounwind memory(argmem: readwrite) }
-; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn memory(read) }
-; CHECK: attributes #37 = { nocallback nounwind }
+; CHECK: attributes #33 = { nounwind memory(argmem: read) }
+; CHECK: attributes #34 = { nounwind memory(argmem: readwrite) }
+; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(read) }
+; CHECK: attributes #36 = { nocallback nounwind }
+; CHECK: attributes #37 = { nocallback nofree nosync nounwind willreturn }
 ; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #39 = { builtin }
 
diff --git a/llvm/test/Bitcode/compatibility-3.8.ll b/llvm/test/Bitcode/compatibility-3.8.ll
index aa4d8b14968c6e..ebd1f2fff8c94c 100644
--- a/llvm/test/Bitcode/compatibility-3.8.ll
+++ b/llvm/test/Bitcode/compatibility-3.8.ll
@@ -1247,16 +1247,16 @@ define void @instructions.va_arg(i8* %v, ...) {
   %ap2 = bitcast i8** %ap to i8*
 
   call void @llvm.va_start(i8* %ap2)
-  ; CHECK: call void @llvm.va_start(ptr %ap2)
+  ; CHECK: call void @llvm.va_start.p0(ptr %ap2)
 
   va_arg i8* %ap2, i32
   ; CHECK: va_arg ptr %ap2, i32
 
   call void @llvm.va_copy(i8* %v, i8* %ap2)
-  ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2)
+  ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2)
 
   call void @llvm.va_end(i8* %ap2)
-  ; CHECK: call void @llvm.va_end(ptr %ap2)
+  ; CHECK: call void @llvm.va_end.p0(ptr %ap2)
 
   ret void
 }
@@ -1551,11 +1551,11 @@ normal:
 ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) }
 ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #37 = { nounwind memory(argmem: read) }
-; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) }
-; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) }
-; CHECK: attributes #40 = { nocallback nounwind }
+; CHECK: attributes #36 = { nounwind memory(argmem: read) }
+; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) }
+; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) }
+; CHECK: attributes #39 = { nocallback nounwind }
+; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn }
 ; CHECK: attributes #41 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #42 = { builtin }
 
diff --git a/llvm/test/Bitcode/compatibility-3.9.ll b/llvm/test/Bitcode/compatibility-3.9.ll
index e3c84f6e600714..c34f04ceb0de39 100644
--- a/llvm/test/Bitcode/compatibility-3.9.ll
+++ b/llvm/test/Bitcode/compatibility-3.9.ll
@@ -1318,16 +1318,16 @@ define void @instructions.va_arg(i8* %v, ...) {
   %ap2 = bitcast i8** %ap to i8*
 
   call void @llvm.va_start(i8* %ap2)
-  ; CHECK: call void @llvm.va_start(ptr %ap2)
+  ; CHECK: call void @llvm.va_start.p0(ptr %ap2)
 
   va_arg i8* %ap2, i32
   ; CHECK: va_arg ptr %ap2, i32
 
   call void @llvm.va_copy(i8* %v, i8* %ap2)
-  ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2)
+  ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2)
 
   call void @llvm.va_end(i8* %ap2)
-  ; CHECK: call void @llvm.va_end(ptr %ap2)
+  ; CHECK: call void @llvm.va_end.p0(ptr %ap2)
 
   ret void
 }
@@ -1624,11 +1624,11 @@ declare void @f.writeonly() writeonly
 ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) }
 ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #37 = { nounwind memory(argmem: read) }
-; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) }
-; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) }
-; CHECK: attributes #40 = { nocallback nounwind }
+; CHECK: attributes #36 = { nounwind memory(argmem: read) }
+; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) }
+; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) }
+; CHECK: attributes #39 = { nocallback nounwind }
+; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn }
 ; CHECK: attributes #41 = { memory(write) }
 ; CHECK: attributes #42 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #43 = { builtin }
diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll
index 06cb842059a4fa..05bffda1d117a3 100644
--- a/llvm/test/Bitcode/compatibility-4.0.ll
+++ b/llvm/test/Bitcode/compatibility-4.0.ll
@@ -1318,16 +1318,16 @@ define void @instructions.va_arg(i8* %v, ...) {
   %ap2 = bitcast i8** %ap to i8*
 
   call void @llvm.va_start(i8* %ap2)
-  ; CHECK: call void @llvm.va_start(ptr %ap2)
+  ; CHECK: call void @llvm.va_start.p0(ptr %ap2)
 
   va_arg i8* %ap2, i32
   ; CHECK: va_arg ptr %ap2, i32
 
   call void @llvm.va_copy(i8* %v, i8* %ap2)
-  ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2)
+  ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2)
 
   call void @llvm.va_end(i8* %ap2)
-  ; CHECK: call void @llvm.va_end(ptr %ap2)
+  ; CHECK: call void @llvm.va_end.p0(ptr %ap2)
 
   ret void
 }
@@ -1649,11 +1649,11 @@ define i8** @constexpr() {
 ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) }
 ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #37 = { nounwind memory(argmem: read) }
-; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) }
-; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) }
-; CHECK: attributes #40 = { nocallback nounwind }
+; CHECK: attributes #36 = { nounwind memory(argmem: read) }
+; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) }
+; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) }
+; CHECK: attributes #39 = { nocallback nounwind }
+; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn }
 ; CHECK: attributes #41 = { memory(write) }
 ; CHECK: attributes #42 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #43 = { builtin }
diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll
index f9ae558917cddc..0c872289c62ba8 100644
--- a/llvm/test/Bitcode/compatibility-5.0.ll
+++ b/llvm/test/Bitcode/compatibility-5.0.ll
@@ -1330,16 +1330,16 @@ define void @instructions.va_arg(i8* %v, ...) {
   %ap2 = bitcast i8** %ap to i8*
 
   call void @llvm.va_start(i8* %ap2)
-  ; CHECK: call void @llvm.va_start(ptr %ap2)
+  ; CHECK: call void @llvm.va_start.p0(ptr %ap2)
 
   va_arg i8* %ap2, i32
   ; CHECK: va_arg ptr %ap2, i32
 
   call void @llvm.va_copy(i8* %v, i8* %ap2)
-  ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2)
+  ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2)
 
   call void @llvm.va_end(i8* %ap2)
-  ; CHECK: call void @llvm.va_end(ptr %ap2)
+  ; CHECK: call void @llvm.va_end.p0(ptr %ap2)
 
   ret void
 }
@@ -1664,11 +1664,11 @@ define i8** @constexpr() {
 ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) }
 ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #37 = { nounwind memory(argmem: read) }
-; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) }
-; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) }
-; CHECK: attributes #40 = { nocallback nounwind }
+; CHECK: attributes #36 = { nounwind memory(argmem: read) }
+; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) }
+; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) }
+; CHECK: attributes #39 = { nocallback nounwind }
+; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn }
 ; CHECK: attributes #41 = { memory(write) }
 ; CHECK: attributes #42 = { speculatable }
 ; CHECK: attributes #43 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll
index 1458e1b639ad44..44c680885be34f 100644
--- a/llvm/test/Bitcode/compatibility-6.0.ll
+++ b/llvm/test/Bitcode/compatibility-6.0.ll
@@ -1340,16 +1340,16 @@ define void @instructions.va_arg(i8* %v, ...) {
   %ap2 = bitcast i8** %ap to i8*
 
   call void @llvm.va_start(i8* %ap2)
-  ; CHECK: call void @llvm.va_start(ptr %ap2)
+  ; CHECK: call void @llvm.va_start.p0(ptr %ap2)
 
   va_arg i8* %ap2, i32
   ; CHECK: va_arg ptr %ap2, i32
 
   call void @llvm.va_copy(i8* %v, i8* %ap2)
-  ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap2)
+  ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap2)
 
   call void @llvm.va_end(i8* %ap2)
-  ; CHECK: call void @llvm.va_end(ptr %ap2)
+  ; CHECK: call void @llvm.va_end.p0(ptr %ap2)
 
   ret void
 }
@@ -1674,11 +1674,11 @@ define i8** @constexpr() {
 ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) }
 ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #37 = { nounwind memory(argmem: read) }
-; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) }
-; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) }
-; CHECK: attributes #40 = { nocallback nounwind }
+; CHECK: attributes #36 = { nounwind memory(argmem: read) }
+; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) }
+; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) }
+; CHECK: attributes #39 = { nocallback nounwind }
+; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn }
 ; CHECK: attributes #41 = { memory(write) }
 ; CHECK: attributes #42 = { speculatable }
 ; CHECK: attributes #43 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index fa8b0520567a6f..b374924516d665 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1648,16 +1648,16 @@ define void @instructions.va_arg(ptr %v, ...) {
   %ap = alloca ptr
 
   call void @llvm.va_start(ptr %ap)
-  ; CHECK: call void @llvm.va_start(ptr %ap)
+  ; CHECK: call void @llvm.va_start.p0(ptr %ap)
 
   va_arg ptr %ap, i32
   ; CHECK: va_arg ptr %ap, i32
 
   call void @llvm.va_copy(ptr %v, ptr %ap)
-  ; CHECK: call void @llvm.va_copy(ptr %v, ptr %ap)
+  ; CHECK: call void @llvm.va_copy.p0(ptr %v, ptr %ap)
 
   call void @llvm.va_end(ptr %ap)
-  ; CHECK: call void @llvm.va_end(ptr %ap)
+  ; CHECK: call void @llvm.va_end.p0(ptr %ap)
 
   ret void
 }
@@ -2091,12 +2091,12 @@ define float @nofpclass_callsites(float %arg) {
 ; CHECK: attributes #33 = { memory(inaccessiblemem: readwrite) }
 ; CHECK: attributes #34 = { memory(argmem: readwrite, inaccessiblemem: readwrite) }
 ; CHECK: attributes #35 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #36 = { nocallback nofree nosync nounwind willreturn }
-; CHECK: attributes #37 = { nounwind memory(argmem: read) }
-; CHECK: attributes #38 = { nounwind memory(argmem: readwrite) }
-; CHECK: attributes #39 = { nocallback nofree nosync nounwind willreturn memory(read) }
-; CHECK: attributes #40 = { nocallback nounwind }
-; CHECK: attributes #41 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
+; CHECK: attributes #36 = { nounwind memory(argmem: read) }
+; CHECK: attributes #37 = { nounwind memory(argmem: readwrite) }
+; CHECK: attributes #38 = { nocallback nofree nosync nounwind willreturn memory(read) }
+; CHECK: attributes #39 = { nocallback nounwind }
+; CHECK: attributes #40 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) }
+; CHECK: attributes #41 = { nocallback nofree nosync nounwind willreturn }
 ; CHECK: attributes #42 = { memory(write) }
 ; CHECK: attributes #43 = { speculatable }
 ; CHECK: attributes #44 = { strictfp }
diff --git a/llvm/test/Bitcode/thinlto-function-summary.ll b/llvm/test/Bitcode/thinlto-function-summary.ll
index 799759ebcac1ad..13c6611843d651 100644
--- a/llvm/test/Bitcode/thinlto-function-summary.ll
+++ b/llvm/test/Bitcode/thinlto-function-summary.ll
@@ -13,9 +13,9 @@
 ; "variadic"
 ; BC-NEXT: <FUNCTION op0=46 op1=8
 ; "llvm.va_start"
-; BC-NEXT: <FUNCTION op0=54 op1=13
+; BC-NEXT: <FUNCTION op0=54 op1=16
 ; "f"
-; BC-NEXT: <ALIAS op0=67 op1=1
+; BC-NEXT: <ALIAS op0=70 op1=1
 ; BC: <GLOBALVAL_SUMMARY_BLOCK
 ; BC-NEXT: <VERSION
 ; BC-NEXT: <FLAGS
@@ -26,7 +26,7 @@
 ; BC-NEXT: <ALIAS {{.*}} op0=6 op1=0 op2=3
 ; BC-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 ; BC: <STRTAB_BLOCK
-; BC-NEXT: blob data = 'hfoobaranon.{{................................}}.0variadicllvm.va_startf{{.*}}'
+; BC-NEXT: blob data = 'hfoobaranon.{{................................}}.0variadicllvm.va_start.p{{[0-9]+}}f{{.*}}'
 
 
 ; RUN: opt -passes=name-anon-globals -module-summary < %s | llvm-dis | FileCheck %s
diff --git a/llvm/test/Bitcode/variableArgumentIntrinsic.3.2.ll b/llvm/test/Bitcode/variableArgumentIntrinsic.3.2.ll
index fad7b8ea6a58b9..fd3f500de7918c 100644
--- a/llvm/test/Bitcode/variableArgumentIntrinsic.3.2.ll
+++ b/llvm/test/Bitcode/variableArgumentIntrinsic.3.2.ll
@@ -10,7 +10,7 @@ define i32 @varArgIntrinsic(i32 %X, ...) {
   %ap = alloca i8*
   %ap2 = bitcast i8** %ap to i8*
 
-; CHECK: call void @llvm.va_start(ptr %ap2)
+; CHECK: call void @llvm.va_start.p0(ptr %ap2)
   call void @llvm.va_start(i8* %ap2)
 
 ; CHECK-NEXT: %tmp = va_arg ptr %ap, i32
@@ -19,12 +19,12 @@ define i32 @varArgIntrinsic(i32 %X, ...) {
   %aq = alloca i8*
   %aq2 = bitcast i8** %aq to i8*
 
-; CHECK: call void @llvm.va_copy(ptr %aq2, ptr %ap2)
+; CHECK: call void @llvm.va_copy.p0(ptr %aq2, ptr %ap2)
   call void @llvm.va_copy(i8* %aq2, i8* %ap2)
-; CHECK-NEXT: call void @llvm.va_end(ptr %aq2)
+; CHECK-NEXT: call void @llvm.va_end.p0(ptr %aq2)
   call void @llvm.va_end(i8* %aq2)
 
-; CHECK-NEXT:  call void @llvm.va_end(ptr %ap2)
+; CHECK-NEXT:  call void @llvm.va_end.p0(ptr %ap2)
   call void @llvm.va_end(i8* %ap2)
   ret i32 %tmp
 }
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll
index 96ac4b6088c31c..9133b329deb263 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll
@@ -758,7 +758,7 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef %t, i32 noundef
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -808,7 +808,7 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef %t, i32 noundef
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -851,7 +851,7 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -901,7 +901,7 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -936,7 +936,7 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -986,7 +986,7 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1021,7 +1021,7 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1071,7 +1071,7 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1106,7 +1106,7 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(fp128 noundef %t, i32 nound
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1156,7 +1156,7 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(fp128 noundef %t, i32 nound
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1191,7 +1191,7 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1241,7 +1241,7 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1276,7 +1276,7 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz([2 x i64] %t.coe
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1326,7 +1326,7 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz([2 x i64] %t.coe
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1361,7 +1361,7 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz([2 x double] a
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1411,7 +1411,7 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz([2 x double] a
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1446,7 +1446,7 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz([4 x double] alignst
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1496,7 +1496,7 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz([4 x double] alignst
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1531,7 +1531,7 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz([2 x i64] %t.co
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1581,7 +1581,7 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz([2 x i64] %t.co
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1616,7 +1616,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz([2 x fp128] ali
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1666,7 +1666,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz([2 x fp128] ali
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1701,7 +1701,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz([4 x fp128] ali
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1751,7 +1751,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz([4 x fp128] ali
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP52]], ptr align 16 [[TMP53]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/SystemZ/vararg-kernel.ll b/llvm/test/Instrumentation/MemorySanitizer/SystemZ/vararg-kernel.ll
index 1535fccfc21107..e0b5907719afcb 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/SystemZ/vararg-kernel.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/SystemZ/vararg-kernel.ll
@@ -39,7 +39,7 @@ define i64 @foo(i64 %guard, ...) #1 {
 ; Only 56 bytes of the register save area is copied, because of
 ; "use-soft-float".
 
-; CHECK: call void @llvm.va_start(ptr %vl)
+; CHECK: call void @llvm.va_start.p0(ptr %vl)
 ; CHECK: [[VlAddr:%.*]] = ptrtoint ptr %vl to i64
 ; CHECK: [[RegSaveAreaAddrAddr:%.*]] = add i64 [[VlAddr]], 24
 ; CHECK: [[RegSaveAreaAddr:%.*]] = inttoptr i64 [[RegSaveAreaAddrAddr]] to ptr
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll
index aff4d2c55ad6fc..205101564dfe09 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll
@@ -560,7 +560,7 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef signext %t, i32
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -580,7 +580,7 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef signext %t, i32
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -623,7 +623,7 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -643,7 +643,7 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -678,7 +678,7 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -698,7 +698,7 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -733,7 +733,7 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -753,7 +753,7 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -788,7 +788,7 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(x86_fp80 noundef %t, i32 no
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -808,7 +808,7 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(x86_fp80 noundef %t, i32 no
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -843,7 +843,7 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -863,7 +863,7 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -898,7 +898,7 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz(i64 %t.coerce0,
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -918,7 +918,7 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz(i64 %t.coerce0,
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -953,7 +953,7 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz(double %t.coer
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -973,7 +973,7 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz(double %t.coer
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1008,7 +1008,7 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz(ptr noundef byval(%s
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1028,7 +1028,7 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz(ptr noundef byval(%s
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1063,7 +1063,7 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz(double %t.coerc
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1083,7 +1083,7 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz(double %t.coerc
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1118,7 +1118,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz(ptr noundef byv
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1138,7 +1138,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz(ptr noundef byv
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
@@ -1173,7 +1173,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz(ptr noundef byv
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
-; CHECK-NEXT:    call void @llvm.va_start(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP10]], 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
@@ -1193,7 +1193,7 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz(ptr noundef byv
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP24]], i64 [[TMP0]], i1 false)
 ; CHECK-NEXT:    store i64 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @_Z3usePv(ptr noundef nonnull [[ARGS]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr nonnull [[ARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[ARGS]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll
index 21f3311a57efa6..f07f3ad06e6077 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll
@@ -542,7 +542,7 @@ define void @VAStart(i32 %x, ...) sanitize_memory {
 ; CHECK-NEXT:    [[TMP29:%.*]] = add i64 [[TMP27]], 17592186044416, !dbg [[DBG11]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr, !dbg [[DBG11]]
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 24, i1 false), !dbg [[DBG11]]
-; CHECK-NEXT:    call void @llvm.va_start(ptr [[VA]]), !dbg [[DBG11]]
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VA]]), !dbg [[DBG11]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = ptrtoint ptr [[VA]] to i64, !dbg [[DBG11]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], 16, !dbg [[DBG11]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr, !dbg [[DBG11]]
diff --git a/llvm/test/Transforms/GlobalOpt/inalloca-varargs.ll b/llvm/test/Transforms/GlobalOpt/inalloca-varargs.ll
index 188210782edd98..4c5a448d12c482 100644
--- a/llvm/test/Transforms/GlobalOpt/inalloca-varargs.ll
+++ b/llvm/test/Transforms/GlobalOpt/inalloca-varargs.ll
@@ -23,7 +23,7 @@ define internal i32 @i(ptr inalloca(ptr) %a, ...) {
 ; CHECK-LABEL: define {{[^@]+}}@i
 ; CHECK-SAME: (ptr inalloca(ptr) [[A:%.*]], ...) unnamed_addr {
 ; CHECK-NEXT:    [[AP:%.*]] = alloca ptr, align 4
-; CHECK-NEXT:    call void @llvm.va_start(ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[AP]])
 ; CHECK-NEXT:    [[ARGP_CUR:%.*]] = load ptr, ptr [[AP]], align 4
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[ARGP_CUR]], align 4
 ; CHECK-NEXT:    ret i32 [[L]]
diff --git a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
index ef365d6eaddb5b..38dfd25e039e6d 100644
--- a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
@@ -17,10 +17,10 @@ define i32 @func1(i32 %a, double %b, ptr %v, ...) nounwind {
 ; CHECK-NEXT:    [[AP:%.*]] = alloca ptr, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[A:%.*]], ptr [[A_ADDR]], double [[B:%.*]], ptr [[B_ADDR]])
-; CHECK-NEXT:    call void @llvm.va_start(ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[AP]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = va_arg ptr [[AP]], i32
-; CHECK-NEXT:    call void @llvm.va_copy(ptr [[V:%.*]], ptr [[AP]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_copy.p0(ptr [[V:%.*]], ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[AP]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP_LOC]])
 ; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[TMP0]], ptr [[C]], ptr [[TMP_LOC]])
 ; CHECK-NEXT:    [[TMP_RELOAD:%.*]] = load i32, ptr [[TMP_LOC]], align 4
@@ -52,10 +52,10 @@ define i32 @func2(i32 %a, double %b, ptr %v, ...) nounwind {
 ; CHECK-NEXT:    [[AP:%.*]] = alloca ptr, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[A:%.*]], ptr [[A_ADDR]], double [[B:%.*]], ptr [[B_ADDR]])
-; CHECK-NEXT:    call void @llvm.va_start(ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[AP]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = va_arg ptr [[AP]], i32
-; CHECK-NEXT:    call void @llvm.va_copy(ptr [[V:%.*]], ptr [[AP]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_copy.p0(ptr [[V:%.*]], ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[AP]])
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP_LOC]])
 ; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[TMP0]], ptr [[C]], ptr [[TMP_LOC]])
 ; CHECK-NEXT:    [[TMP_RELOAD:%.*]] = load i32, ptr [[TMP_LOC]], align 4
diff --git a/llvm/test/Transforms/IROutliner/outline-vaarg-intrinsic.ll b/llvm/test/Transforms/IROutliner/outline-vaarg-intrinsic.ll
index 9f565de960575b..2d526086fae49c 100644
--- a/llvm/test/Transforms/IROutliner/outline-vaarg-intrinsic.ll
+++ b/llvm/test/Transforms/IROutliner/outline-vaarg-intrinsic.ll
@@ -51,7 +51,7 @@ entry:
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
 ; CHECK-NEXT:    store double [[B]], ptr [[B_ADDR]], align 8
-; CHECK-NEXT:    call void @llvm.va_start(ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[AP]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = va_arg ptr [[AP]], i32
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP_LOC]])
 ; CHECK-NEXT:    call void @outlined_ir_func_0(ptr [[V]], ptr [[AP]], i32 [[TMP0]], ptr [[C]], ptr [[TMP_LOC]])
@@ -70,7 +70,7 @@ entry:
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
 ; CHECK-NEXT:    store double [[B]], ptr [[B_ADDR]], align 8
-; CHECK-NEXT:    call void @llvm.va_start(ptr [[AP]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[AP]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = va_arg ptr [[AP]], i32
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP_LOC]])
 ; CHECK-NEXT:    call void @outlined_ir_func_0(ptr [[V]], ptr [[AP]], i32 [[TMP0]], ptr [[C]], ptr [[TMP_LOC]])
@@ -84,8 +84,8 @@ entry:
 ; CHECK-NEXT:  newFuncRoot:
 ; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
 ; CHECK:       entry_to_outline:
-; CHECK-NEXT:    call void @llvm.va_copy(ptr [[TMP0]], ptr [[TMP1]])
-; CHECK-NEXT:    call void @llvm.va_end(ptr [[TMP1]])
+; CHECK-NEXT:    call void @llvm.va_copy.p0(ptr [[TMP0]], ptr [[TMP1]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[TMP1]])
 ; CHECK-NEXT:    store i32 [[TMP2]], ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    br label [[ENTRY_AFTER_OUTLINE_EXITSTUB:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll
index 0e7461c2612b9f..82e9a2ab286ee5 100644
--- a/llvm/test/Transforms/NewGVN/pr31483.ll
+++ b/llvm/test/Transforms/NewGVN/pr31483.ll
@@ -41,7 +41,7 @@ define signext i32 @ham(ptr %arg, ptr %arg1) #0 {
 ; CHECK:       bb22:
 ; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb23:
-; CHECK-NEXT:    call void @llvm.va_end(ptr [[TMP]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[TMP]])
 ; CHECK-NEXT:    ret i32 undef
 ;
 bb:
diff --git a/llvm/test/Transforms/Reassociate/vaarg_movable.ll b/llvm/test/Transforms/Reassociate/vaarg_movable.ll
index 337877a54a9071..4e45b219fccd59 100644
--- a/llvm/test/Transforms/Reassociate/vaarg_movable.ll
+++ b/llvm/test/Transforms/Reassociate/vaarg_movable.ll
@@ -10,13 +10,13 @@ define i32 @func(i32 %dummy, ...) {
 ;
 ; CHECK-LABEL: @func(
 ; CHECK-NEXT:    [[VARARGS:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    call void @llvm.va_start(ptr [[VARARGS]])
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[VARARGS]])
 ; CHECK-NEXT:    [[V0:%.*]] = va_arg ptr [[VARARGS]], i32
 ; CHECK-NEXT:    [[V1:%.*]] = va_arg ptr [[VARARGS]], i32
 ; CHECK-NEXT:    [[V0_NEG:%.*]] = sub i32 0, [[V0]]
 ; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[V0_NEG]], 1
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SUB]], [[V1]]
-; CHECK-NEXT:    call void @llvm.va_end(ptr [[VARARGS]])
+; CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[VARARGS]])
 ; CHECK-NEXT:    ret i32 [[ADD]]
 ;
   %varargs = alloca ptr, align 8
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index f4bac9376f2ea0..28526f1a1560ce 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -611,19 +611,19 @@ def LLVM_DbgLabelOp : LLVM_IntrOp<"dbg.label", [], [], [], 0> {
 // Variadic function intrinsics.
 //
 
-def LLVM_VaStartOp : LLVM_ZeroResultIntrOp<"vastart">,
+def LLVM_VaStartOp : LLVM_ZeroResultIntrOp<"vastart", [0]>,
                      Arguments<(ins LLVM_AnyPointer:$arg_list)> {
   let assemblyFormat = "$arg_list attr-dict `:` qualified(type($arg_list))";
   let summary = "Initializes `arg_list` for subsequent variadic argument extractions.";
 }
 
-def LLVM_VaCopyOp : LLVM_ZeroResultIntrOp<"vacopy">,
+def LLVM_VaCopyOp : LLVM_ZeroResultIntrOp<"vacopy", [0]>,
                     Arguments<(ins LLVM_AnyPointer:$dest_list, LLVM_AnyPointer:$src_list)> {
   let assemblyFormat = "$src_list `to` $dest_list attr-dict `:` type(operands)";
   let summary = "Copies the current argument position from `src_list` to `dest_list`.";
 }
 
-def LLVM_VaEndOp : LLVM_ZeroResultIntrOp<"vaend">,
+def LLVM_VaEndOp : LLVM_ZeroResultIntrOp<"vaend", [0]>,
                    Arguments<(ins LLVM_AnyPointer:$arg_list)> {
   let assemblyFormat = "$arg_list attr-dict `:` qualified(type($arg_list))";
   let summary = "Destroys `arg_list`, which has been initialized by `intr.vastart` or `intr.vacopy`.";
diff --git a/mlir/test/Target/LLVMIR/Import/basic.ll b/mlir/test/Target/LLVMIR/Import/basic.ll
index a059425d978067..448b0ebe25746c 100644
--- a/mlir/test/Target/LLVMIR/Import/basic.ll
+++ b/mlir/test/Target/LLVMIR/Import/basic.ll
@@ -72,26 +72,26 @@ define i32 @useFreezeOp(i32 %x) {
 ; Varadic function definition
 %struct.va_list = type { ptr }
 
-declare void @llvm.va_start(ptr)
-declare void @llvm.va_copy(ptr, ptr)
-declare void @llvm.va_end(ptr)
+declare void @llvm.va_start.p0(ptr)
+declare void @llvm.va_copy.p0(ptr, ptr)
+declare void @llvm.va_end.p0(ptr)
 
 ; CHECK-LABEL: llvm.func @variadic_function
 define void @variadic_function(i32 %X, ...) {
   ; CHECK: %[[ALLOCA0:.+]] = llvm.alloca %{{.*}} x !llvm.struct<"struct.va_list", (ptr)> {{.*}} : (i32) -> !llvm.ptr
   %ap = alloca %struct.va_list
   ; CHECK: llvm.intr.vastart %[[ALLOCA0]]
-  call void @llvm.va_start(ptr %ap)
+  call void @llvm.va_start.p0(ptr %ap)
 
   ; CHECK: %[[ALLOCA1:.+]] = llvm.alloca %{{.*}} x !llvm.ptr {{.*}} : (i32) -> !llvm.ptr
   %aq = alloca ptr
   ; CHECK: llvm.intr.vacopy %[[ALLOCA0]] to %[[ALLOCA1]]
-  call void @llvm.va_copy(ptr %aq, ptr %ap)
+  call void @llvm.va_copy.p0(ptr %aq, ptr %ap)
   ; CHECK: llvm.intr.vaend %[[ALLOCA1]]
-  call void @llvm.va_end(ptr %aq)
+  call void @llvm.va_end.p0(ptr %aq)
 
   ; CHECK: llvm.intr.vaend %[[ALLOCA0]]
-  call void @llvm.va_end(ptr %ap)
+  call void @llvm.va_end.p0(ptr %ap)
   ; CHECK: llvm.return
   ret void
 }
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index 85561839f31a70..0cefb4f8983aa6 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -599,11 +599,11 @@ define void @ushl_sat_test(i32 %0, i32 %1, <8 x i32> %2, <8 x i32> %3) {
 ; CHECK-LABEL: llvm.func @va_intrinsics_test
 define void @va_intrinsics_test(ptr %0, ptr %1) {
 ; CHECK: llvm.intr.vastart %{{.*}}
-  call void @llvm.va_start(ptr %0)
+  call void @llvm.va_start.p0(ptr %0)
 ; CHECK: llvm.intr.vacopy %{{.*}} to %{{.*}}
-  call void @llvm.va_copy(ptr %1, ptr %0)
+  call void @llvm.va_copy.p0(ptr %1, ptr %0)
 ; CHECK: llvm.intr.vaend %{{.*}}
-  call void @llvm.va_end(ptr %0)
+  call void @llvm.va_end.p0(ptr %0)
   ret void
 }
 
@@ -1076,9 +1076,9 @@ declare ptr @llvm.stacksave.p0()
 declare ptr addrspace(1) @llvm.stacksave.p1()
 declare void @llvm.stackrestore.p0(ptr)
 declare void @llvm.stackrestore.p1(ptr addrspace(1))
-declare void @llvm.va_start(ptr)
-declare void @llvm.va_copy(ptr, ptr)
-declare void @llvm.va_end(ptr)
+declare void @llvm.va_start.p0(ptr)
+declare void @llvm.va_copy.p0(ptr, ptr)
+declare void @llvm.va_end.p0(ptr)
 declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
 declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
 declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index c38c7ea587d25b..97f37939551d83 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -2251,14 +2251,14 @@ llvm.func @vararg_function(%arg0: i32, ...) {
   %1 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA0:.+]] = alloca %struct.va_list, align 8
   %2 = llvm.alloca %1 x !llvm.struct<"struct.va_list", (ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  // CHECK: call void @llvm.va_start(ptr %[[ALLOCA0]])
+  // CHECK: call void @llvm.va_start.p0(ptr %[[ALLOCA0]])
   llvm.intr.vastart %2 : !llvm.ptr
   // CHECK: %[[ALLOCA1:.+]] = alloca ptr, align 8
   %4 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  // CHECK: call void @llvm.va_copy(ptr %[[ALLOCA1]], ptr %[[ALLOCA0]])
+  // CHECK: call void @llvm.va_copy.p0(ptr %[[ALLOCA1]], ptr %[[ALLOCA0]])
   llvm.intr.vacopy %2 to %4 : !llvm.ptr, !llvm.ptr
-  // CHECK: call void @llvm.va_end(ptr %[[ALLOCA1]])
-  // CHECK: call void @llvm.va_end(ptr %[[ALLOCA0]])
+  // CHECK: call void @llvm.va_end.p0(ptr %[[ALLOCA1]])
+  // CHECK: call void @llvm.va_end.p0(ptr %[[ALLOCA0]])
   llvm.intr.vaend %4 : !llvm.ptr
   llvm.intr.vaend %2 : !llvm.ptr
   // CHECK: ret void

>From ef316da4a2c5954a02c92707b5cb621402b76910 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 27 Mar 2024 00:56:47 +0530
Subject: [PATCH 11/54] AMDGPU: Fix dead check prefixes in test

---
 .../AMDGPU/global_atomics_i32_system.ll       | 564 ------------------
 .../AMDGPU/global_atomics_i64_system.ll       | 564 ------------------
 2 files changed, 1128 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 76ec1cc84f55b2..99d02ffaa523d5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -358,65 +358,6 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1)
 ; ---------------------------------------------------------------------
 
 define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_noret:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_load_dword v3, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[4:5], 0
-; GCN1-NEXT:  .LBB0_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT:    v_mov_b32_e32 v3, v4
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_execnz .LBB0_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_noret:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_load_dword v3, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[4:5], 0
-; GCN2-NEXT:  .LBB0_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT:    v_mov_b32_e32 v3, v4
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_execnz .LBB0_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_noret:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_load_dword v3, v[0:1]
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
-; GCN3-NEXT:  .LBB0_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT:    v_mov_b32_e32 v3, v4
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_cbranch_execnz .LBB0_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_noret:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -450,69 +391,6 @@ define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
 }
 
 define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_noret_offset:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_add_f32_e32 v0, vcc, 16, v0
-; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT:    global_load_dword v3, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[4:5], 0
-; GCN1-NEXT:  .LBB1_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT:    v_mov_b32_e32 v3, v4
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_execnz .LBB1_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_noret_offset:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT:    global_load_dword v3, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[4:5], 0
-; GCN2-NEXT:  .LBB1_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT:    v_mov_b32_e32 v3, v4
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_execnz .LBB1_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_noret_offset:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
-; GCN3-NEXT:  .LBB1_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT:    v_mov_b32_e32 v3, v4
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_cbranch_execnz .LBB1_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_noret_offset:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -549,71 +427,6 @@ define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %i
 }
 
 define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_ret:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_load_dword v4, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[4:5], 0
-; GCN1-NEXT:  .LBB2_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v3, v4
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_execnz .LBB2_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    v_mov_b32_e32 v0, v4
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_ret:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_load_dword v4, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[4:5], 0
-; GCN2-NEXT:  .LBB2_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v3, v4
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_execnz .LBB2_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    v_mov_b32_e32 v0, v4
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_ret:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_load_dword v4, v[0:1]
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
-; GCN3-NEXT:  .LBB2_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v3, v4
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_cbranch_execnz .LBB2_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    v_mov_b32_e32 v0, v4
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_ret:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -648,73 +461,6 @@ define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
 }
 
 define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_ret_offset:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_add_f32_e32 v4, vcc, 16, v0
-; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT:    global_load_dword v0, v[4:5]
-; GCN1-NEXT:    s_mov_b64 s[4:5], 0
-; GCN1-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v3, v0
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[4:5], v[2:3] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_execnz .LBB3_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_ret_offset:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT:    global_load_dword v0, v[4:5]
-; GCN2-NEXT:    s_mov_b64 s[4:5], 0
-; GCN2-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v3, v0
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[4:5], v[2:3] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_execnz .LBB3_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_ret_offset:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
-; GCN3-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v3, v4
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_cbranch_execnz .LBB3_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    v_mov_b32_e32 v0, v4
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_ret_offset:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -752,80 +498,6 @@ define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in
 }
 
 define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_noret_scalar:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v0, s4
-; GCN1-NEXT:    v_mov_b32_e32 v1, s5
-; GCN1-NEXT:    global_load_dword v1, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[34:35], 0
-; GCN1-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    v_mov_b32_e32 v2, s4
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    v_mov_b32_e32 v3, s5
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT:    v_mov_b32_e32 v1, v0
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_execnz .LBB4_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_noret_scalar:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v0, s4
-; GCN2-NEXT:    v_mov_b32_e32 v1, s5
-; GCN2-NEXT:    global_load_dword v1, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[34:35], 0
-; GCN2-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    v_mov_b32_e32 v2, s4
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    v_mov_b32_e32 v3, s5
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT:    v_mov_b32_e32 v1, v0
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_execnz .LBB4_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_noret_scalar:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    global_load_dword v1, v[0:1]
-; GCN3-NEXT:    s_mov_b64 s[34:35], 0
-; GCN3-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    v_mov_b32_e32 v2, s4
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v3, s5
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT:    v_mov_b32_e32 v1, v0
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_execnz .LBB4_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_noret_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -876,84 +548,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr
 }
 
 define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    s_add_u32 s34, s4, 16
-; GCN1-NEXT:    s_addc_u32 s35, s5, 0
-; GCN1-NEXT:    v_mov_b32_e32 v0, s34
-; GCN1-NEXT:    v_mov_b32_e32 v1, s35
-; GCN1-NEXT:    global_load_dword v1, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[36:37], 0
-; GCN1-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    v_mov_b32_e32 v2, s34
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    v_mov_b32_e32 v3, s35
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT:    v_mov_b32_e32 v1, v0
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_execnz .LBB5_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    s_add_u32 s34, s4, 16
-; GCN2-NEXT:    s_addc_u32 s35, s5, 0
-; GCN2-NEXT:    v_mov_b32_e32 v0, s34
-; GCN2-NEXT:    v_mov_b32_e32 v1, s35
-; GCN2-NEXT:    global_load_dword v1, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[36:37], 0
-; GCN2-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    v_mov_b32_e32 v2, s34
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    v_mov_b32_e32 v3, s35
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT:    v_mov_b32_e32 v1, v0
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_execnz .LBB5_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    global_load_dword v1, v[0:1] offset:16
-; GCN3-NEXT:    s_mov_b64 s[34:35], 0
-; GCN3-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    v_mov_b32_e32 v2, s4
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v3, s5
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT:    v_mov_b32_e32 v1, v0
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_execnz .LBB5_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1007,83 +601,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace
 }
 
 define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_ret_scalar:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v0, s4
-; GCN1-NEXT:    v_mov_b32_e32 v1, s5
-; GCN1-NEXT:    global_load_dword v0, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[34:35], 0
-; GCN1-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    v_mov_b32_e32 v3, s4
-; GCN1-NEXT:    v_mov_b32_e32 v1, s6
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v2, v0
-; GCN1-NEXT:    v_mov_b32_e32 v4, s5
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_execnz .LBB6_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_ret_scalar:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v0, s4
-; GCN2-NEXT:    v_mov_b32_e32 v1, s5
-; GCN2-NEXT:    global_load_dword v0, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[34:35], 0
-; GCN2-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    v_mov_b32_e32 v3, s4
-; GCN2-NEXT:    v_mov_b32_e32 v1, s6
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v2, v0
-; GCN2-NEXT:    v_mov_b32_e32 v4, s5
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_execnz .LBB6_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_ret_scalar:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    global_load_dword v0, v[0:1]
-; GCN3-NEXT:    s_mov_b64 s[34:35], 0
-; GCN3-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s6
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v2, v0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_execnz .LBB6_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_ret_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1134,87 +651,6 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inre
 }
 
 define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    s_add_u32 s34, s4, 16
-; GCN1-NEXT:    s_addc_u32 s35, s5, 0
-; GCN1-NEXT:    v_mov_b32_e32 v0, s34
-; GCN1-NEXT:    v_mov_b32_e32 v1, s35
-; GCN1-NEXT:    global_load_dword v0, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[36:37], 0
-; GCN1-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    v_mov_b32_e32 v3, s34
-; GCN1-NEXT:    v_mov_b32_e32 v1, s6
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v2, v0
-; GCN1-NEXT:    v_mov_b32_e32 v4, s35
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN1-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_execnz .LBB7_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    s_add_u32 s34, s4, 16
-; GCN2-NEXT:    s_addc_u32 s35, s5, 0
-; GCN2-NEXT:    v_mov_b32_e32 v0, s34
-; GCN2-NEXT:    v_mov_b32_e32 v1, s35
-; GCN2-NEXT:    global_load_dword v0, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[36:37], 0
-; GCN2-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    v_mov_b32_e32 v3, s34
-; GCN2-NEXT:    v_mov_b32_e32 v1, s6
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v2, v0
-; GCN2-NEXT:    v_mov_b32_e32 v4, s35
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN2-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_execnz .LBB7_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    global_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT:    s_mov_b64 s[34:35], 0
-; GCN3-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s6
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v2, v0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_execnz .LBB7_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index d137f471910dc6..380ce7f3b93988 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -372,65 +372,6 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1)
 ; ---------------------------------------------------------------------
 
 define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_noret:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_load_dword v3, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[4:5], 0
-; GCN1-NEXT:  .LBB0_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT:    v_mov_b32_e32 v3, v4
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_execnz .LBB0_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_noret:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_load_dword v3, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[4:5], 0
-; GCN2-NEXT:  .LBB0_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT:    v_mov_b32_e32 v3, v4
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_execnz .LBB0_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_noret:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_load_dword v3, v[0:1]
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
-; GCN3-NEXT:  .LBB0_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT:    v_mov_b32_e32 v3, v4
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_cbranch_execnz .LBB0_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_noret:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,69 +405,6 @@ define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) {
 }
 
 define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_noret_offset:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_add_f64_e32 v0, vcc, 16, v0
-; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT:    global_load_dword v3, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[4:5], 0
-; GCN1-NEXT:  .LBB1_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT:    v_mov_b32_e32 v3, v4
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_execnz .LBB1_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_noret_offset:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT:    global_load_dword v3, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[4:5], 0
-; GCN2-NEXT:  .LBB1_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT:    v_mov_b32_e32 v3, v4
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_execnz .LBB1_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_noret_offset:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
-; GCN3-NEXT:  .LBB1_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT:    v_mov_b32_e32 v3, v4
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_cbranch_execnz .LBB1_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_noret_offset:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -563,71 +441,6 @@ define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %
 }
 
 define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_ret:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_load_dword v4, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[4:5], 0
-; GCN1-NEXT:  .LBB2_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v3, v4
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_execnz .LBB2_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    v_mov_b32_e32 v0, v4
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_ret:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_load_dword v4, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[4:5], 0
-; GCN2-NEXT:  .LBB2_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v3, v4
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_execnz .LBB2_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    v_mov_b32_e32 v0, v4
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_ret:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_load_dword v4, v[0:1]
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
-; GCN3-NEXT:  .LBB2_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v3, v4
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_cbranch_execnz .LBB2_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    v_mov_b32_e32 v0, v4
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_ret:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -663,73 +476,6 @@ define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) {
 }
 
 define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_ret_offset:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_add_f64_e32 v4, vcc, 16, v0
-; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT:    global_load_dword v0, v[4:5]
-; GCN1-NEXT:    s_mov_b64 s[4:5], 0
-; GCN1-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v3, v0
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[4:5], v[2:3] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_execnz .LBB3_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_ret_offset:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT:    global_load_dword v0, v[4:5]
-; GCN2-NEXT:    s_mov_b64 s[4:5], 0
-; GCN2-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v3, v0
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[4:5], v[2:3] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_execnz .LBB3_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_ret_offset:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT:    s_mov_b64 s[4:5], 0
-; GCN3-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v3, v4
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    s_cbranch_execnz .LBB3_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT:    v_mov_b32_e32 v0, v4
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_ret_offset:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -768,80 +514,6 @@ define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %
 }
 
 define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_noret_scalar:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v0, s4
-; GCN1-NEXT:    v_mov_b32_e32 v1, s5
-; GCN1-NEXT:    global_load_dword v1, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[34:35], 0
-; GCN1-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    v_mov_b32_e32 v2, s4
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    v_mov_b32_e32 v3, s5
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT:    v_mov_b32_e32 v1, v0
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_execnz .LBB4_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_noret_scalar:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v0, s4
-; GCN2-NEXT:    v_mov_b32_e32 v1, s5
-; GCN2-NEXT:    global_load_dword v1, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[34:35], 0
-; GCN2-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    v_mov_b32_e32 v2, s4
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    v_mov_b32_e32 v3, s5
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT:    v_mov_b32_e32 v1, v0
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_execnz .LBB4_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_noret_scalar:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    global_load_dword v1, v[0:1]
-; GCN3-NEXT:    s_mov_b64 s[34:35], 0
-; GCN3-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    v_mov_b32_e32 v2, s4
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v3, s5
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT:    v_mov_b32_e32 v1, v0
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_execnz .LBB4_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_noret_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -896,84 +568,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr
 }
 
 define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    s_add_u32 s34, s4, 16
-; GCN1-NEXT:    s_addc_u32 s35, s5, 0
-; GCN1-NEXT:    v_mov_b32_e32 v0, s34
-; GCN1-NEXT:    v_mov_b32_e32 v1, s35
-; GCN1-NEXT:    global_load_dword v1, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[36:37], 0
-; GCN1-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    v_mov_b32_e32 v2, s34
-; GCN1-NEXT:    v_mov_b32_e32 v0, s6
-; GCN1-NEXT:    v_mov_b32_e32 v3, s35
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT:    v_mov_b32_e32 v1, v0
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_execnz .LBB5_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    s_add_u32 s34, s4, 16
-; GCN2-NEXT:    s_addc_u32 s35, s5, 0
-; GCN2-NEXT:    v_mov_b32_e32 v0, s34
-; GCN2-NEXT:    v_mov_b32_e32 v1, s35
-; GCN2-NEXT:    global_load_dword v1, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[36:37], 0
-; GCN2-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    v_mov_b32_e32 v2, s34
-; GCN2-NEXT:    v_mov_b32_e32 v0, s6
-; GCN2-NEXT:    v_mov_b32_e32 v3, s35
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT:    v_mov_b32_e32 v1, v0
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_execnz .LBB5_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    global_load_dword v1, v[0:1] offset:16
-; GCN3-NEXT:    s_mov_b64 s[34:35], 0
-; GCN3-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    v_mov_b32_e32 v2, s4
-; GCN3-NEXT:    v_mov_b32_e32 v0, s6
-; GCN3-NEXT:    v_mov_b32_e32 v3, s5
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT:    v_mov_b32_e32 v1, v0
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_execnz .LBB5_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1029,83 +623,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
 }
 
 define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_ret_scalar:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v0, s4
-; GCN1-NEXT:    v_mov_b32_e32 v1, s5
-; GCN1-NEXT:    global_load_dword v0, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[34:35], 0
-; GCN1-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    v_mov_b32_e32 v3, s4
-; GCN1-NEXT:    v_mov_b32_e32 v1, s6
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v2, v0
-; GCN1-NEXT:    v_mov_b32_e32 v4, s5
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_execnz .LBB6_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_ret_scalar:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v0, s4
-; GCN2-NEXT:    v_mov_b32_e32 v1, s5
-; GCN2-NEXT:    global_load_dword v0, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[34:35], 0
-; GCN2-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    v_mov_b32_e32 v3, s4
-; GCN2-NEXT:    v_mov_b32_e32 v1, s6
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v2, v0
-; GCN2-NEXT:    v_mov_b32_e32 v4, s5
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_execnz .LBB6_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_ret_scalar:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    global_load_dword v0, v[0:1]
-; GCN3-NEXT:    s_mov_b64 s[34:35], 0
-; GCN3-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s6
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v2, v0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_execnz .LBB6_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_ret_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1160,87 +677,6 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr
 }
 
 define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
-; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    s_add_u32 s34, s4, 16
-; GCN1-NEXT:    s_addc_u32 s35, s5, 0
-; GCN1-NEXT:    v_mov_b32_e32 v0, s34
-; GCN1-NEXT:    v_mov_b32_e32 v1, s35
-; GCN1-NEXT:    global_load_dword v0, v[0:1]
-; GCN1-NEXT:    s_mov_b64 s[36:37], 0
-; GCN1-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GCN1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT:    v_mov_b32_e32 v3, s34
-; GCN1-NEXT:    v_mov_b32_e32 v1, s6
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    v_mov_b32_e32 v2, v0
-; GCN1-NEXT:    v_mov_b32_e32 v4, s35
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN1-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT:    s_andn2_b64 exec, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_execnz .LBB7_1
-; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
-; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    s_add_u32 s34, s4, 16
-; GCN2-NEXT:    s_addc_u32 s35, s5, 0
-; GCN2-NEXT:    v_mov_b32_e32 v0, s34
-; GCN2-NEXT:    v_mov_b32_e32 v1, s35
-; GCN2-NEXT:    global_load_dword v0, v[0:1]
-; GCN2-NEXT:    s_mov_b64 s[36:37], 0
-; GCN2-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GCN2-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT:    v_mov_b32_e32 v3, s34
-; GCN2-NEXT:    v_mov_b32_e32 v1, s6
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    v_mov_b32_e32 v2, v0
-; GCN2-NEXT:    v_mov_b32_e32 v4, s35
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN2-NEXT:    s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT:    s_andn2_b64 exec, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_execnz .LBB7_1
-; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT:    s_or_b64 exec, exec, s[36:37]
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
-; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v0, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s5
-; GCN3-NEXT:    global_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT:    s_mov_b64 s[34:35], 0
-; GCN3-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GCN3-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT:    v_mov_b32_e32 v3, s4
-; GCN3-NEXT:    v_mov_b32_e32 v1, s6
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    v_mov_b32_e32 v2, v0
-; GCN3-NEXT:    v_mov_b32_e32 v4, s5
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc
-; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_execnz .LBB7_1
-; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

>From 1103a2a337e90d8c7cc417b89e43c7a33aaea21e Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <5994977+JanekvO at users.noreply.github.com>
Date: Wed, 27 Mar 2024 11:59:56 +0000
Subject: [PATCH 12/54] Reland [AMDGPU] MCExpr-ify MC layer kernel descriptor
 (#86494)

Convert the kernel descriptor attributes, together with their respective emit and asm-parse functionality, to MCExpr.

Relands #80855 with fixes
---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  40 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h     |  11 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 194 ++++++---
 .../MCTargetDesc/AMDGPUMCKernelDescriptor.cpp |  98 +++++
 .../MCTargetDesc/AMDGPUMCKernelDescriptor.h   |  54 +++
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     | 404 +++++++++++-------
 .../MCTargetDesc/AMDGPUTargetStreamer.h       |  33 +-
 .../Target/AMDGPU/MCTargetDesc/CMakeLists.txt |   1 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  41 --
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   7 -
 llvm/test/MC/AMDGPU/hsa-amdgpu-exprs.s        |  27 ++
 llvm/test/MC/AMDGPU/hsa-sym-expr-failure.s    | 281 ++++++++++++
 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s     | 190 ++++++++
 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s     | 186 ++++++++
 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s     | 184 ++++++++
 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s      | 168 ++++++++
 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s      | 171 ++++++++
 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s    | 148 +++++++
 llvm/test/MC/AMDGPU/hsa-tg-split.s            |  74 ++++
 19 files changed, 1999 insertions(+), 313 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.h
 create mode 100644 llvm/test/MC/AMDGPU/hsa-amdgpu-exprs.s
 create mode 100644 llvm/test/MC/AMDGPU/hsa-sym-expr-failure.s
 create mode 100644 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
 create mode 100644 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
 create mode 100644 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s
 create mode 100644 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s
 create mode 100644 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s
 create mode 100644 llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s
 create mode 100644 llvm/test/MC/AMDGPU/hsa-tg-split.s

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 72e8b59e0a4096..052b231d62a3eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -22,6 +22,7 @@
 #include "AMDKernelCodeT.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "R600AsmPrinter.h"
 #include "SIMachineFunctionInfo.h"
@@ -428,38 +429,43 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
   return KernelCodeProperties;
 }
 
-amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
-    const MachineFunction &MF,
-    const SIProgramInfo &PI) const {
+MCKernelDescriptor
+AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
+                                            const SIProgramInfo &PI) const {
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const Function &F = MF.getFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  MCContext &Ctx = MF.getContext();
 
-  amdhsa::kernel_descriptor_t KernelDescriptor;
-  memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
+  MCKernelDescriptor KernelDescriptor;
 
   assert(isUInt<32>(PI.ScratchSize));
   assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
   assert(isUInt<32>(PI.getComputePGMRSrc2()));
 
-  KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
-  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
+  KernelDescriptor.group_segment_fixed_size =
+      MCConstantExpr::create(PI.LDSSize, Ctx);
+  KernelDescriptor.private_segment_fixed_size =
+      MCConstantExpr::create(PI.ScratchSize, Ctx);
 
   Align MaxKernArgAlign;
-  KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
+  KernelDescriptor.kernarg_size = MCConstantExpr::create(
+      STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
 
-  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM);
-  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
-  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+  KernelDescriptor.compute_pgm_rsrc1 =
+      MCConstantExpr::create(PI.getComputePGMRSrc1(STM), Ctx);
+  KernelDescriptor.compute_pgm_rsrc2 =
+      MCConstantExpr::create(PI.getComputePGMRSrc2(), Ctx);
+  KernelDescriptor.kernel_code_properties =
+      MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
 
   assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
-  if (STM.hasGFX90AInsts())
-    KernelDescriptor.compute_pgm_rsrc3 =
-      CurrentProgramInfo.ComputePGMRSrc3GFX90A;
+  KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create(
+      STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx);
 
-  if (AMDGPU::hasKernargPreload(STM))
-    KernelDescriptor.kernarg_preload =
-        static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
+  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
+      AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
+      Ctx);
 
   return KernelDescriptor;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 79326cd3d3289f..b8b2718d293e69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -28,15 +28,12 @@ class MCCodeEmitter;
 class MCOperand;
 
 namespace AMDGPU {
+struct MCKernelDescriptor;
 namespace HSAMD {
 class MetadataStreamer;
 }
 } // namespace AMDGPU
 
-namespace amdhsa {
-struct kernel_descriptor_t;
-}
-
 class AMDGPUAsmPrinter final : public AsmPrinter {
 private:
   unsigned CodeObjectVersion;
@@ -75,9 +72,9 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
   uint16_t getAmdhsaKernelCodeProperties(
       const MachineFunction &MF) const;
 
-  amdhsa::kernel_descriptor_t getAmdhsaKernelDescriptor(
-      const MachineFunction &MF,
-      const SIProgramInfo &PI) const;
+  AMDGPU::MCKernelDescriptor
+  getAmdhsaKernelDescriptor(const MachineFunction &MF,
+                            const SIProgramInfo &PI) const;
 
   void initTargetStreamer(Module &M);
 
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 4648df199c741d..294fc683fe9216 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8,6 +8,7 @@
 
 #include "AMDKernelCodeT.h"
 #include "MCTargetDesc/AMDGPUMCExpr.h"
+#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "SIDefines.h"
@@ -5417,7 +5418,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
   if (getParser().parseIdentifier(KernelName))
     return true;
 
-  kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI());
+  AMDGPU::MCKernelDescriptor KD =
+      AMDGPU::MCKernelDescriptor::getDefaultAmdhsaKernelDescriptor(
+          &getSTI(), getContext());
 
   StringSet<> Seen;
 
@@ -5457,89 +5460,111 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
       return TokError(".amdhsa_ directives cannot be repeated");
 
     SMLoc ValStart = getLoc();
-    int64_t IVal;
-    if (getParser().parseAbsoluteExpression(IVal))
+    const MCExpr *ExprVal;
+    if (getParser().parseExpression(ExprVal))
       return true;
     SMLoc ValEnd = getLoc();
     SMRange ValRange = SMRange(ValStart, ValEnd);
 
-    if (IVal < 0)
-      return OutOfRangeError(ValRange);
-
+    int64_t IVal = 0;
     uint64_t Val = IVal;
+    bool EvaluatableExpr;
+    if ((EvaluatableExpr = ExprVal->evaluateAsAbsolute(IVal))) {
+      if (IVal < 0)
+        return OutOfRangeError(ValRange);
+      Val = IVal;
+    }
 
 #define PARSE_BITS_ENTRY(FIELD, ENTRY, VALUE, RANGE)                           \
-  if (!isUInt<ENTRY##_WIDTH>(VALUE))                                           \
+  if (!isUInt<ENTRY##_WIDTH>(Val))                                             \
     return OutOfRangeError(RANGE);                                             \
-  AMDHSA_BITS_SET(FIELD, ENTRY, VALUE);
+  AMDGPU::MCKernelDescriptor::bits_set(FIELD, VALUE, ENTRY##_SHIFT, ENTRY,     \
+                                       getContext());
+
+// Some fields use the parsed value immediately which requires the expression to
+// be solvable.
+#define EXPR_RESOLVE_OR_ERROR(RESOLVED)                                        \
+  if (!(RESOLVED))                                                             \
+    return Error(IDRange.Start, "directive should have resolvable expression", \
+                 IDRange);
 
     if (ID == ".amdhsa_group_segment_fixed_size") {
-      if (!isUInt<sizeof(KD.group_segment_fixed_size) * CHAR_BIT>(Val))
+      if (!isUInt<sizeof(kernel_descriptor_t::group_segment_fixed_size) *
+                  CHAR_BIT>(Val))
         return OutOfRangeError(ValRange);
-      KD.group_segment_fixed_size = Val;
+      KD.group_segment_fixed_size = ExprVal;
     } else if (ID == ".amdhsa_private_segment_fixed_size") {
-      if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val))
+      if (!isUInt<sizeof(kernel_descriptor_t::private_segment_fixed_size) *
+                  CHAR_BIT>(Val))
         return OutOfRangeError(ValRange);
-      KD.private_segment_fixed_size = Val;
+      KD.private_segment_fixed_size = ExprVal;
     } else if (ID == ".amdhsa_kernarg_size") {
-      if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val))
+      if (!isUInt<sizeof(kernel_descriptor_t::kernarg_size) * CHAR_BIT>(Val))
         return OutOfRangeError(ValRange);
-      KD.kernarg_size = Val;
+      KD.kernarg_size = ExprVal;
     } else if (ID == ".amdhsa_user_sgpr_count") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       ExplicitUserSGPRCount = Val;
     } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       if (hasArchitectedFlatScratch())
         return Error(IDRange.Start,
                      "directive is not supported with architected flat scratch",
                      IDRange);
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
                        KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
-                       Val, ValRange);
+                       ExprVal, ValRange);
       if (Val)
         ImpliedUserSGPRCount += 4;
     } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_length") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       if (!hasKernargPreload())
         return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
 
       if (Val > getMaxNumUserSGPRs())
         return OutOfRangeError(ValRange);
-      PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_LENGTH, Val,
+      PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_LENGTH, ExprVal,
                        ValRange);
       if (Val) {
         ImpliedUserSGPRCount += Val;
         PreloadLength = Val;
       }
     } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_offset") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       if (!hasKernargPreload())
         return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
 
       if (Val >= 1024)
         return OutOfRangeError(ValRange);
-      PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, Val,
+      PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, ExprVal,
                        ValRange);
       if (Val)
         PreloadOffset = Val;
     } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
-                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
+                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, ExprVal,
                        ValRange);
       if (Val)
         ImpliedUserSGPRCount += 2;
     } else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
-                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
+                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, ExprVal,
                        ValRange);
       if (Val)
         ImpliedUserSGPRCount += 2;
     } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
                        KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
-                       Val, ValRange);
+                       ExprVal, ValRange);
       if (Val)
         ImpliedUserSGPRCount += 2;
     } else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
-                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
+                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, ExprVal,
                        ValRange);
       if (Val)
         ImpliedUserSGPRCount += 2;
@@ -5548,34 +5573,39 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
         return Error(IDRange.Start,
                      "directive is not supported with architected flat scratch",
                      IDRange);
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
-                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
-                       ValRange);
+                       KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT,
+                       ExprVal, ValRange);
       if (Val)
         ImpliedUserSGPRCount += 2;
     } else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
                        KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
-                       Val, ValRange);
+                       ExprVal, ValRange);
       if (Val)
         ImpliedUserSGPRCount += 1;
     } else if (ID == ".amdhsa_wavefront_size32") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       if (IVersion.Major < 10)
         return Error(IDRange.Start, "directive requires gfx10+", IDRange);
       EnableWavefrontSize32 = Val;
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
-                       KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
-                       Val, ValRange);
+                       KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, ExprVal,
+                       ValRange);
     } else if (ID == ".amdhsa_uses_dynamic_stack") {
       PARSE_BITS_ENTRY(KD.kernel_code_properties,
-                       KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, Val, ValRange);
+                       KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, ExprVal,
+                       ValRange);
     } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
       if (hasArchitectedFlatScratch())
         return Error(IDRange.Start,
                      "directive is not supported with architected flat scratch",
                      IDRange);
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
-                       COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
+                       COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, ExprVal,
+                       ValRange);
     } else if (ID == ".amdhsa_enable_private_segment") {
       if (!hasArchitectedFlatScratch())
         return Error(
@@ -5583,42 +5613,48 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
             "directive is not supported without architected flat scratch",
             IDRange);
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
-                       COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
+                       COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, ExprVal,
+                       ValRange);
     } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
-                       COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
+                       COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_system_sgpr_workgroup_id_y") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
-                       COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, Val,
+                       COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_system_sgpr_workgroup_id_z") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
-                       COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, Val,
+                       COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_system_sgpr_workgroup_info") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
-                       COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, Val,
+                       COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_system_vgpr_workitem_id") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
-                       COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, Val,
+                       COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_next_free_vgpr") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       VGPRRange = ValRange;
       NextFreeVGPR = Val;
     } else if (ID == ".amdhsa_next_free_sgpr") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       SGPRRange = ValRange;
       NextFreeSGPR = Val;
     } else if (ID == ".amdhsa_accum_offset") {
       if (!isGFX90A())
         return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       AccumOffset = Val;
     } else if (ID == ".amdhsa_reserve_vcc") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       if (!isUInt<1>(Val))
         return OutOfRangeError(ValRange);
       ReserveVCC = Val;
     } else if (ID == ".amdhsa_reserve_flat_scratch") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       if (IVersion.Major < 7)
         return Error(IDRange.Start, "directive requires gfx7+", IDRange);
       if (hasArchitectedFlatScratch())
@@ -5638,97 +5674,105 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
                                  IDRange);
     } else if (ID == ".amdhsa_float_round_mode_32") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
-                       COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange);
+                       COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, ExprVal,
+                       ValRange);
     } else if (ID == ".amdhsa_float_round_mode_16_64") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
-                       COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, Val, ValRange);
+                       COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, ExprVal,
+                       ValRange);
     } else if (ID == ".amdhsa_float_denorm_mode_32") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
-                       COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, Val, ValRange);
+                       COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, ExprVal,
+                       ValRange);
     } else if (ID == ".amdhsa_float_denorm_mode_16_64") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
-                       COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val,
+                       COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_dx10_clamp") {
       if (IVersion.Major >= 12)
         return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange);
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
-                       COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, Val,
+                       COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_ieee_mode") {
       if (IVersion.Major >= 12)
         return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange);
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
-                       COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, Val,
+                       COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_fp16_overflow") {
       if (IVersion.Major < 9)
         return Error(IDRange.Start, "directive requires gfx9+", IDRange);
-      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL, Val,
+      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+                       COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_tg_split") {
       if (!isGFX90A())
         return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
-      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Val,
-                       ValRange);
+      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+                       ExprVal, ValRange);
     } else if (ID == ".amdhsa_workgroup_processor_mode") {
       if (IVersion.Major < 10)
         return Error(IDRange.Start, "directive requires gfx10+", IDRange);
-      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, Val,
+      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+                       COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_memory_ordered") {
       if (IVersion.Major < 10)
         return Error(IDRange.Start, "directive requires gfx10+", IDRange);
-      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Val,
+      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+                       COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_forward_progress") {
       if (IVersion.Major < 10)
         return Error(IDRange.Start, "directive requires gfx10+", IDRange);
-      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Val,
+      PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+                       COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_shared_vgpr_count") {
+      EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
       if (IVersion.Major < 10 || IVersion.Major >= 12)
         return Error(IDRange.Start, "directive requires gfx10 or gfx11",
                      IDRange);
       SharedVGPRCount = Val;
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3,
-                       COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, Val,
+                       COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, ExprVal,
                        ValRange);
     } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
       PARSE_BITS_ENTRY(
           KD.compute_pgm_rsrc2,
-          COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, Val,
-          ValRange);
+          COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION,
+          ExprVal, ValRange);
     } else if (ID == ".amdhsa_exception_fp_denorm_src") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
                        COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE,
-                       Val, ValRange);
+                       ExprVal, ValRange);
     } else if (ID == ".amdhsa_exception_fp_ieee_div_zero") {
       PARSE_BITS_ENTRY(
           KD.compute_pgm_rsrc2,
-          COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, Val,
-          ValRange);
+          COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO,
+          ExprVal, ValRange);
     } else if (ID == ".amdhsa_exception_fp_ieee_overflow") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
                        COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW,
-                       Val, ValRange);
+                       ExprVal, ValRange);
     } else if (ID == ".amdhsa_exception_fp_ieee_underflow") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
                        COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW,
-                       Val, ValRange);
+                       ExprVal, ValRange);
     } else if (ID == ".amdhsa_exception_fp_ieee_inexact") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
                        COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT,
-                       Val, ValRange);
+                       ExprVal, ValRange);
     } else if (ID == ".amdhsa_exception_int_div_zero") {
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
                        COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO,
-                       Val, ValRange);
+                       ExprVal, ValRange);
     } else if (ID == ".amdhsa_round_robin_scheduling") {
       if (IVersion.Major < 12)
         return Error(IDRange.Start, "directive requires gfx12+", IDRange);
       PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
-                       COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, Val,
+                       COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, ExprVal,
                        ValRange);
     } else {
       return Error(IDRange.Start, "unknown .amdhsa_kernel directive", IDRange);
@@ -5755,15 +5799,18 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
   if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>(
           VGPRBlocks))
     return OutOfRangeError(VGPRRange);
-  AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                  COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, VGPRBlocks);
+  AMDGPU::MCKernelDescriptor::bits_set(
+      KD.compute_pgm_rsrc1, MCConstantExpr::create(VGPRBlocks, getContext()),
+      COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT,
+      COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, getContext());
 
   if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>(
           SGPRBlocks))
     return OutOfRangeError(SGPRRange);
-  AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                  COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT,
-                  SGPRBlocks);
+  AMDGPU::MCKernelDescriptor::bits_set(
+      KD.compute_pgm_rsrc1, MCConstantExpr::create(SGPRBlocks, getContext()),
+      COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT,
+      COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, getContext());
 
   if (ExplicitUserSGPRCount && ImpliedUserSGPRCount > *ExplicitUserSGPRCount)
     return TokError("amdgpu_user_sgpr_count smaller than than implied by "
@@ -5774,11 +5821,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
 
   if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
     return TokError("too many user SGPRs enabled");
-  AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
-                  UserSGPRCount);
-
-  if (PreloadLength && KD.kernarg_size &&
-      (PreloadLength * 4 + PreloadOffset * 4 > KD.kernarg_size))
+  AMDGPU::MCKernelDescriptor::bits_set(
+      KD.compute_pgm_rsrc2, MCConstantExpr::create(UserSGPRCount, getContext()),
+      COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_SHIFT,
+      COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, getContext());
+
+  int64_t IVal = 0;
+  if (!KD.kernarg_size->evaluateAsAbsolute(IVal))
+    return TokError("Kernarg size should be resolvable");
+  uint64_t kernarg_size = IVal;
+  if (PreloadLength && kernarg_size &&
+      (PreloadLength * 4 + PreloadOffset * 4 > kernarg_size))
     return TokError("Kernarg preload length + offset is larger than the "
                     "kernarg segment size");
 
@@ -5790,8 +5843,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
                       "increments of 4");
     if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4))
       return TokError("accum_offset exceeds total VGPR allocation");
-    AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
-                    (AccumOffset / 4 - 1));
+    MCKernelDescriptor::bits_set(
+        KD.compute_pgm_rsrc3,
+        MCConstantExpr::create(AccumOffset / 4 - 1, getContext()),
+        COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
+        COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, getContext());
   }
 
   if (IVersion.Major >= 10 && IVersion.Major < 12) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp
new file mode 100644
index 00000000000000..77e7e30ff5281b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp
@@ -0,0 +1,98 @@
+//===--- AMDGPUMCKernelDescriptor.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCKernelDescriptor.h"
+#include "AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/TargetParser/TargetParser.h"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+MCKernelDescriptor
+MCKernelDescriptor::getDefaultAmdhsaKernelDescriptor(const MCSubtargetInfo *STI,
+                                                     MCContext &Ctx) {
+  IsaVersion Version = getIsaVersion(STI->getCPU());
+
+  MCKernelDescriptor KD;
+  const MCExpr *ZeroMCExpr = MCConstantExpr::create(0, Ctx);
+  const MCExpr *OneMCExpr = MCConstantExpr::create(1, Ctx);
+
+  KD.group_segment_fixed_size = ZeroMCExpr;
+  KD.private_segment_fixed_size = ZeroMCExpr;
+  KD.compute_pgm_rsrc1 = ZeroMCExpr;
+  KD.compute_pgm_rsrc2 = ZeroMCExpr;
+  KD.compute_pgm_rsrc3 = ZeroMCExpr;
+  KD.kernarg_size = ZeroMCExpr;
+  KD.kernel_code_properties = ZeroMCExpr;
+  KD.kernarg_preload = ZeroMCExpr;
+
+  MCKernelDescriptor::bits_set(
+      KD.compute_pgm_rsrc1,
+      MCConstantExpr::create(amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE, Ctx),
+      amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Ctx);
+  if (Version.Major < 12) {
+    MCKernelDescriptor::bits_set(
+        KD.compute_pgm_rsrc1, OneMCExpr,
+        amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP_SHIFT,
+        amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, Ctx);
+    MCKernelDescriptor::bits_set(
+        KD.compute_pgm_rsrc1, OneMCExpr,
+        amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE_SHIFT,
+        amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, Ctx);
+  }
+  MCKernelDescriptor::bits_set(
+      KD.compute_pgm_rsrc2, OneMCExpr,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Ctx);
+  if (Version.Major >= 10) {
+    if (STI->getFeatureBits().test(FeatureWavefrontSize32))
+      MCKernelDescriptor::bits_set(
+          KD.kernel_code_properties, OneMCExpr,
+          amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
+          amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, Ctx);
+    if (!STI->getFeatureBits().test(FeatureCuMode))
+      MCKernelDescriptor::bits_set(
+          KD.compute_pgm_rsrc1, OneMCExpr,
+          amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
+          amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, Ctx);
+
+    MCKernelDescriptor::bits_set(
+        KD.compute_pgm_rsrc1, OneMCExpr,
+        amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
+        amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Ctx);
+  }
+  if (AMDGPU::isGFX90A(*STI) && STI->getFeatureBits().test(FeatureTgSplit))
+    MCKernelDescriptor::bits_set(
+        KD.compute_pgm_rsrc3, OneMCExpr,
+        amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
+        amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx);
+  return KD;
+}
+
+void MCKernelDescriptor::bits_set(const MCExpr *&Dst, const MCExpr *Value,
+                                  uint32_t Shift, uint32_t Mask,
+                                  MCContext &Ctx) {
+  auto Sft = MCConstantExpr::create(Shift, Ctx);
+  auto Msk = MCConstantExpr::create(Mask, Ctx);
+  Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+  Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Sft, Ctx),
+                               Ctx);
+}
+
+const MCExpr *MCKernelDescriptor::bits_get(const MCExpr *Src, uint32_t Shift,
+                                           uint32_t Mask, MCContext &Ctx) {
+  auto Sft = MCConstantExpr::create(Shift, Ctx);
+  auto Msk = MCConstantExpr::create(Mask, Ctx);
+  return MCBinaryExpr::createLShr(MCBinaryExpr::createAnd(Src, Msk, Ctx), Sft,
+                                  Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.h
new file mode 100644
index 00000000000000..26958ac8b9ee17
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.h
@@ -0,0 +1,54 @@
+//===--- AMDGPUMCKernelDescriptor.h ---------------------------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// AMDHSA kernel descriptor MCExpr struct for use in MC layer. Uses
+/// AMDHSAKernelDescriptor.h for sizes and constants.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELDESCRIPTOR_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELDESCRIPTOR_H
+
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
+
+namespace llvm {
+class MCExpr;
+class MCContext;
+class MCSubtargetInfo;
+namespace AMDGPU {
+
+struct MCKernelDescriptor {
+  const MCExpr *group_segment_fixed_size = nullptr;
+  const MCExpr *private_segment_fixed_size = nullptr;
+  const MCExpr *kernarg_size = nullptr;
+  const MCExpr *compute_pgm_rsrc3 = nullptr;
+  const MCExpr *compute_pgm_rsrc1 = nullptr;
+  const MCExpr *compute_pgm_rsrc2 = nullptr;
+  const MCExpr *kernel_code_properties = nullptr;
+  const MCExpr *kernarg_preload = nullptr;
+
+  static MCKernelDescriptor
+  getDefaultAmdhsaKernelDescriptor(const MCSubtargetInfo *STI, MCContext &Ctx);
+  // MCExpr for:
+  // Dst = Dst & ~Mask
+  // Dst = Dst | (Value << Shift)
+  static void bits_set(const MCExpr *&Dst, const MCExpr *Value, uint32_t Shift,
+                       uint32_t Mask, MCContext &Ctx);
+
+  // MCExpr for:
+  // return (Src & Mask) >> Shift
+  static const MCExpr *bits_get(const MCExpr *Src, uint32_t Shift,
+                                uint32_t Mask, MCContext &Ctx);
+};
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELDESCRIPTOR_H
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 4742b0b3e52ecf..3006fcdb928235 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUTargetStreamer.h"
+#include "AMDGPUMCKernelDescriptor.h"
 #include "AMDGPUPTNote.h"
 #include "AMDKernelCodeT.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -307,94 +308,142 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
 
 void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
     const MCSubtargetInfo &STI, StringRef KernelName,
-    const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
+    const MCKernelDescriptor &KD, uint64_t NextVGPR, uint64_t NextSGPR,
     bool ReserveVCC, bool ReserveFlatScr) {
   IsaVersion IVersion = getIsaVersion(STI.getCPU());
+  const MCAsmInfo *MAI = getContext().getAsmInfo();
 
   OS << "\t.amdhsa_kernel " << KernelName << '\n';
 
-#define PRINT_FIELD(STREAM, DIRECTIVE, KERNEL_DESC, MEMBER_NAME, FIELD_NAME)   \
-  STREAM << "\t\t" << DIRECTIVE << " "                                         \
-         << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n';
-
-  OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size
-     << '\n';
-  OS << "\t\t.amdhsa_private_segment_fixed_size "
-     << KD.private_segment_fixed_size << '\n';
-  OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
-
-  PRINT_FIELD(OS, ".amdhsa_user_sgpr_count", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT);
+  auto PrintField = [&](const MCExpr *Expr, uint32_t Shift, uint32_t Mask,
+                        StringRef Directive) {
+    int64_t IVal;
+    OS << "\t\t" << Directive << ' ';
+    const MCExpr *pgm_rsrc1_bits =
+        MCKernelDescriptor::bits_get(Expr, Shift, Mask, getContext());
+    if (pgm_rsrc1_bits->evaluateAsAbsolute(IVal))
+      OS << static_cast<uint64_t>(IVal);
+    else
+      pgm_rsrc1_bits->print(OS, MAI);
+    OS << '\n';
+  };
+
+  OS << "\t\t.amdhsa_group_segment_fixed_size ";
+  KD.group_segment_fixed_size->print(OS, MAI);
+  OS << '\n';
+
+  OS << "\t\t.amdhsa_private_segment_fixed_size ";
+  KD.private_segment_fixed_size->print(OS, MAI);
+  OS << '\n';
+
+  OS << "\t\t.amdhsa_kernarg_size ";
+  KD.kernarg_size->print(OS, MAI);
+  OS << '\n';
+
+  PrintField(
+      KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, ".amdhsa_user_sgpr_count");
 
   if (!hasArchitectedFlatScratch(STI))
-    PRINT_FIELD(
-        OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
-        kernel_code_properties,
-        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
-  PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
-              kernel_code_properties,
-              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
-  PRINT_FIELD(OS, ".amdhsa_user_sgpr_queue_ptr", KD,
-              kernel_code_properties,
-              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
-  PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD,
-              kernel_code_properties,
-              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
-  PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
-              kernel_code_properties,
-              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
+    PrintField(
+        KD.kernel_code_properties,
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
+        ".amdhsa_user_sgpr_private_segment_buffer");
+  PrintField(KD.kernel_code_properties,
+             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
+             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR,
+             ".amdhsa_user_sgpr_dispatch_ptr");
+  PrintField(KD.kernel_code_properties,
+             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
+             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR,
+             ".amdhsa_user_sgpr_queue_ptr");
+  PrintField(KD.kernel_code_properties,
+             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
+             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
+             ".amdhsa_user_sgpr_kernarg_segment_ptr");
+  PrintField(KD.kernel_code_properties,
+             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
+             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID,
+             ".amdhsa_user_sgpr_dispatch_id");
   if (!hasArchitectedFlatScratch(STI))
-    PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
-                kernel_code_properties,
-                amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+    PrintField(KD.kernel_code_properties,
+               amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
+               amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT,
+               ".amdhsa_user_sgpr_flat_scratch_init");
   if (hasKernargPreload(STI)) {
-    PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_length ", KD,
-                kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH);
-    PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_offset ", KD,
-                kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET);
+    PrintField(KD.kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH_SHIFT,
+               amdhsa::KERNARG_PRELOAD_SPEC_LENGTH,
+               ".amdhsa_user_sgpr_kernarg_preload_length");
+    PrintField(KD.kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET_SHIFT,
+               amdhsa::KERNARG_PRELOAD_SPEC_OFFSET,
+               ".amdhsa_user_sgpr_kernarg_preload_offset");
   }
-  PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
-              kernel_code_properties,
-              amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+  PrintField(
+      KD.kernel_code_properties,
+      amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
+      amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
+      ".amdhsa_user_sgpr_private_segment_size");
   if (IVersion.Major >= 10)
-    PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
-                kernel_code_properties,
-                amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
+    PrintField(KD.kernel_code_properties,
+               amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
+               amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
+               ".amdhsa_wavefront_size32");
   if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
-    PRINT_FIELD(OS, ".amdhsa_uses_dynamic_stack", KD, kernel_code_properties,
-                amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
-  PRINT_FIELD(OS,
-              (hasArchitectedFlatScratch(STI)
-                   ? ".amdhsa_enable_private_segment"
-                   : ".amdhsa_system_sgpr_private_segment_wavefront_offset"),
-              KD, compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
-  PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
-  PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
-  PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
-  PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_info", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
-  PRINT_FIELD(OS, ".amdhsa_system_vgpr_workitem_id", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
+    PrintField(KD.kernel_code_properties,
+               amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT,
+               amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK,
+               ".amdhsa_uses_dynamic_stack");
+  PrintField(KD.compute_pgm_rsrc2,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT,
+             (hasArchitectedFlatScratch(STI)
+                  ? ".amdhsa_enable_private_segment"
+                  : ".amdhsa_system_sgpr_private_segment_wavefront_offset"));
+  PrintField(KD.compute_pgm_rsrc2,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X,
+             ".amdhsa_system_sgpr_workgroup_id_x");
+  PrintField(KD.compute_pgm_rsrc2,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y,
+             ".amdhsa_system_sgpr_workgroup_id_y");
+  PrintField(KD.compute_pgm_rsrc2,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z,
+             ".amdhsa_system_sgpr_workgroup_id_z");
+  PrintField(KD.compute_pgm_rsrc2,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO,
+             ".amdhsa_system_sgpr_workgroup_info");
+  PrintField(KD.compute_pgm_rsrc2,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID,
+             ".amdhsa_system_vgpr_workitem_id");
 
   // These directives are required.
   OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
   OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n';
 
-  if (AMDGPU::isGFX90A(STI))
-    OS << "\t\t.amdhsa_accum_offset " <<
-      (AMDHSA_BITS_GET(KD.compute_pgm_rsrc3,
-                       amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
-      << '\n';
+  if (AMDGPU::isGFX90A(STI)) {
+    // MCExpr equivalent of computing (accum_offset + 1) * 4.
+    const MCExpr *accum_bits = MCKernelDescriptor::bits_get(
+        KD.compute_pgm_rsrc3,
+        amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
+        amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, getContext());
+    accum_bits = MCBinaryExpr::createAdd(
+        accum_bits, MCConstantExpr::create(1, getContext()), getContext());
+    accum_bits = MCBinaryExpr::createMul(
+        accum_bits, MCConstantExpr::create(4, getContext()), getContext());
+    OS << "\t\t.amdhsa_accum_offset ";
+    int64_t IVal;
+    if (accum_bits->evaluateAsAbsolute(IVal)) {
+      OS << static_cast<uint64_t>(IVal);
+    } else {
+      accum_bits->print(OS, MAI);
+    }
+    OS << '\n';
+  }
 
   if (!ReserveVCC)
     OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
@@ -411,74 +460,105 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
     break;
   }
 
-  PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD,
-              compute_pgm_rsrc1,
-              amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
-  PRINT_FIELD(OS, ".amdhsa_float_round_mode_16_64", KD,
-              compute_pgm_rsrc1,
-              amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
-  PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_32", KD,
-              compute_pgm_rsrc1,
-              amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
-  PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_16_64", KD,
-              compute_pgm_rsrc1,
-              amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
+  PrintField(KD.compute_pgm_rsrc1,
+             amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32,
+             ".amdhsa_float_round_mode_32");
+  PrintField(KD.compute_pgm_rsrc1,
+             amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64,
+             ".amdhsa_float_round_mode_16_64");
+  PrintField(KD.compute_pgm_rsrc1,
+             amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32,
+             ".amdhsa_float_denorm_mode_32");
+  PrintField(KD.compute_pgm_rsrc1,
+             amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64_SHIFT,
+             amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
+             ".amdhsa_float_denorm_mode_16_64");
   if (IVersion.Major < 12) {
-    PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD, compute_pgm_rsrc1,
-                amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP);
-    PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD, compute_pgm_rsrc1,
-                amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE);
+    PrintField(KD.compute_pgm_rsrc1,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP,
+               ".amdhsa_dx10_clamp");
+    PrintField(KD.compute_pgm_rsrc1,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE,
+               ".amdhsa_ieee_mode");
+  }
+  if (IVersion.Major >= 9) {
+    PrintField(KD.compute_pgm_rsrc1,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL,
+               ".amdhsa_fp16_overflow");
   }
-  if (IVersion.Major >= 9)
-    PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
-                compute_pgm_rsrc1,
-                amdhsa::COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL);
   if (AMDGPU::isGFX90A(STI))
-    PRINT_FIELD(OS, ".amdhsa_tg_split", KD,
-                compute_pgm_rsrc3,
-                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
+    PrintField(KD.compute_pgm_rsrc3,
+               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split");
   if (IVersion.Major >= 10) {
-    PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD,
-                compute_pgm_rsrc1,
-                amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE);
-    PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD,
-                compute_pgm_rsrc1,
-                amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED);
-    PRINT_FIELD(OS, ".amdhsa_forward_progress", KD,
-                compute_pgm_rsrc1,
-                amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
+    PrintField(KD.compute_pgm_rsrc1,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
+               ".amdhsa_workgroup_processor_mode");
+    PrintField(KD.compute_pgm_rsrc1,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED,
+               ".amdhsa_memory_ordered");
+    PrintField(KD.compute_pgm_rsrc1,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS,
+               ".amdhsa_forward_progress");
   }
   if (IVersion.Major >= 10 && IVersion.Major < 12) {
-    PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3,
-                amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
+    PrintField(KD.compute_pgm_rsrc3,
+               amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT,
+               ".amdhsa_shared_vgpr_count");
   }
-  if (IVersion.Major >= 12)
-    PRINT_FIELD(OS, ".amdhsa_round_robin_scheduling", KD, compute_pgm_rsrc1,
-                amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN);
-  PRINT_FIELD(
-      OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,
-      compute_pgm_rsrc2,
-      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
-  PRINT_FIELD(OS, ".amdhsa_exception_fp_denorm_src", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
-  PRINT_FIELD(
-      OS, ".amdhsa_exception_fp_ieee_div_zero", KD,
-      compute_pgm_rsrc2,
-      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
-  PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_overflow", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
-  PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_underflow", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
-  PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_inexact", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
-  PRINT_FIELD(OS, ".amdhsa_exception_int_div_zero", KD,
-              compute_pgm_rsrc2,
-              amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
-#undef PRINT_FIELD
+  if (IVersion.Major >= 12) {
+    PrintField(KD.compute_pgm_rsrc1,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN_SHIFT,
+               amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN,
+               ".amdhsa_round_robin_scheduling");
+  }
+  PrintField(
+      KD.compute_pgm_rsrc2,
+      amdhsa::
+          COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION,
+      ".amdhsa_exception_fp_ieee_invalid_op");
+  PrintField(
+      KD.compute_pgm_rsrc2,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE,
+      ".amdhsa_exception_fp_denorm_src");
+  PrintField(
+      KD.compute_pgm_rsrc2,
+      amdhsa::
+          COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO,
+      ".amdhsa_exception_fp_ieee_div_zero");
+  PrintField(
+      KD.compute_pgm_rsrc2,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW,
+      ".amdhsa_exception_fp_ieee_overflow");
+  PrintField(
+      KD.compute_pgm_rsrc2,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW,
+      ".amdhsa_exception_fp_ieee_underflow");
+  PrintField(
+      KD.compute_pgm_rsrc2,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT,
+      ".amdhsa_exception_fp_ieee_inexact");
+  PrintField(
+      KD.compute_pgm_rsrc2,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO_SHIFT,
+      amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO,
+      ".amdhsa_exception_int_div_zero");
 
   OS << "\t.end_amdhsa_kernel\n";
 }
@@ -835,7 +915,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
 
 void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
     const MCSubtargetInfo &STI, StringRef KernelName,
-    const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
+    const MCKernelDescriptor &KernelDescriptor, uint64_t NextVGPR,
     uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {
   auto &Streamer = getStreamer();
   auto &Context = Streamer.getContext();
@@ -853,7 +933,7 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
   // Kernel descriptor symbol's type and size are fixed.
   KernelDescriptorSymbol->setType(ELF::STT_OBJECT);
   KernelDescriptorSymbol->setSize(
-      MCConstantExpr::create(sizeof(KernelDescriptor), Context));
+      MCConstantExpr::create(sizeof(amdhsa::kernel_descriptor_t), Context));
 
   // The visibility of the kernel code symbol must be protected or less to allow
   // static relocations from the kernel descriptor to be used.
@@ -861,31 +941,43 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
     KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED);
 
   Streamer.emitLabel(KernelDescriptorSymbol);
-  Streamer.emitInt32(KernelDescriptor.group_segment_fixed_size);
-  Streamer.emitInt32(KernelDescriptor.private_segment_fixed_size);
-  Streamer.emitInt32(KernelDescriptor.kernarg_size);
-
-  for (uint8_t Res : KernelDescriptor.reserved0)
-    Streamer.emitInt8(Res);
+  Streamer.emitValue(
+      KernelDescriptor.group_segment_fixed_size,
+      sizeof(amdhsa::kernel_descriptor_t::group_segment_fixed_size));
+  Streamer.emitValue(
+      KernelDescriptor.private_segment_fixed_size,
+      sizeof(amdhsa::kernel_descriptor_t::private_segment_fixed_size));
+  Streamer.emitValue(KernelDescriptor.kernarg_size,
+                     sizeof(amdhsa::kernel_descriptor_t::kernarg_size));
+
+  for (uint32_t i = 0; i < sizeof(amdhsa::kernel_descriptor_t::reserved0); ++i)
+    Streamer.emitInt8(0u);
 
   // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
   // expression being created is:
   //   (start of kernel code) - (start of kernel descriptor)
   // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64.
-  Streamer.emitValue(MCBinaryExpr::createSub(
-      MCSymbolRefExpr::create(
-          KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context),
-      MCSymbolRefExpr::create(
-          KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context),
-      Context),
-      sizeof(KernelDescriptor.kernel_code_entry_byte_offset));
-  for (uint8_t Res : KernelDescriptor.reserved1)
-    Streamer.emitInt8(Res);
-  Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc3);
-  Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc1);
-  Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc2);
-  Streamer.emitInt16(KernelDescriptor.kernel_code_properties);
-  Streamer.emitInt16(KernelDescriptor.kernarg_preload);
-  for (uint8_t Res : KernelDescriptor.reserved3)
-    Streamer.emitInt8(Res);
+  Streamer.emitValue(
+      MCBinaryExpr::createSub(
+          MCSymbolRefExpr::create(KernelCodeSymbol,
+                                  MCSymbolRefExpr::VK_AMDGPU_REL64, Context),
+          MCSymbolRefExpr::create(KernelDescriptorSymbol,
+                                  MCSymbolRefExpr::VK_None, Context),
+          Context),
+      sizeof(amdhsa::kernel_descriptor_t::kernel_code_entry_byte_offset));
+  for (uint32_t i = 0; i < sizeof(amdhsa::kernel_descriptor_t::reserved1); ++i)
+    Streamer.emitInt8(0u);
+  Streamer.emitValue(KernelDescriptor.compute_pgm_rsrc3,
+                     sizeof(amdhsa::kernel_descriptor_t::compute_pgm_rsrc3));
+  Streamer.emitValue(KernelDescriptor.compute_pgm_rsrc1,
+                     sizeof(amdhsa::kernel_descriptor_t::compute_pgm_rsrc1));
+  Streamer.emitValue(KernelDescriptor.compute_pgm_rsrc2,
+                     sizeof(amdhsa::kernel_descriptor_t::compute_pgm_rsrc2));
+  Streamer.emitValue(
+      KernelDescriptor.kernel_code_properties,
+      sizeof(amdhsa::kernel_descriptor_t::kernel_code_properties));
+  Streamer.emitValue(KernelDescriptor.kernarg_preload,
+                     sizeof(amdhsa::kernel_descriptor_t::kernarg_preload));
+  for (uint32_t i = 0; i < sizeof(amdhsa::kernel_descriptor_t::reserved3); ++i)
+    Streamer.emitInt8(0u);
 }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 5aa80ff578c6b6..706897a5dc1f4d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -22,15 +22,13 @@ class MCSymbol;
 class formatted_raw_ostream;
 
 namespace AMDGPU {
+
+struct MCKernelDescriptor;
 namespace HSAMD {
 struct Metadata;
 }
 } // namespace AMDGPU
 
-namespace amdhsa {
-struct kernel_descriptor_t;
-}
-
 class AMDGPUTargetStreamer : public MCTargetStreamer {
   AMDGPUPALMetadata PALMetadata;
 
@@ -94,10 +92,11 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
     return true;
   }
 
-  virtual void EmitAmdhsaKernelDescriptor(
-      const MCSubtargetInfo &STI, StringRef KernelName,
-      const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
-      uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {}
+  virtual void
+  EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
+                             const AMDGPU::MCKernelDescriptor &KernelDescriptor,
+                             uint64_t NextVGPR, uint64_t NextSGPR,
+                             bool ReserveVCC, bool ReserveFlatScr) {}
 
   static StringRef getArchNameFromElfMach(unsigned ElfMach);
   static unsigned getElfMach(StringRef GPU);
@@ -150,10 +149,11 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
   bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
                                 bool TrapEnabled) override;
 
-  void EmitAmdhsaKernelDescriptor(
-      const MCSubtargetInfo &STI, StringRef KernelName,
-      const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
-      uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
+  void
+  EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
+                             const AMDGPU::MCKernelDescriptor &KernelDescriptor,
+                             uint64_t NextVGPR, uint64_t NextSGPR,
+                             bool ReserveVCC, bool ReserveFlatScr) override;
 };
 
 class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -205,10 +205,11 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
   bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
                                 bool TrapEnabled) override;
 
-  void EmitAmdhsaKernelDescriptor(
-      const MCSubtargetInfo &STI, StringRef KernelName,
-      const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
-      uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
+  void
+  EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName,
+                             const AMDGPU::MCKernelDescriptor &KernelDescriptor,
+                             uint64_t NextVGPR, uint64_t NextSGPR,
+                             bool ReserveVCC, bool ReserveFlatScr) override;
 };
 }
 #endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index 0842a58f794b32..14a02b6d8e3687 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_component_library(LLVMAMDGPUDesc
   AMDGPUMCExpr.cpp
   AMDGPUMCTargetDesc.cpp
   AMDGPUTargetStreamer.cpp
+  AMDGPUMCKernelDescriptor.cpp
   R600InstPrinter.cpp
   R600MCCodeEmitter.cpp
   R600MCTargetDesc.cpp
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 7bb84d78442b24..5d44396b07b605 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1221,47 +1221,6 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
   }
 }
 
-amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
-    const MCSubtargetInfo *STI) {
-  IsaVersion Version = getIsaVersion(STI->getCPU());
-
-  amdhsa::kernel_descriptor_t KD;
-  memset(&KD, 0, sizeof(KD));
-
-  AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                  amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
-                  amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
-  if (Version.Major >= 12) {
-    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                    amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, 0);
-    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                    amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF, 0);
-  } else {
-    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                    amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, 1);
-    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                    amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, 1);
-  }
-  AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
-                  amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
-  if (Version.Major >= 10) {
-    AMDHSA_BITS_SET(KD.kernel_code_properties,
-                    amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
-                    STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1 : 0);
-    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                    amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
-                    STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1);
-    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
-                    amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, 1);
-  }
-  if (AMDGPU::isGFX90A(*STI)) {
-    AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
-                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
-                    STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0);
-  }
-  return KD;
-}
-
 bool isGroupSegment(const GlobalValue *GV) {
   return GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f4f9a787100b5b..943588fe701cc8 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -34,10 +34,6 @@ class StringRef;
 class Triple;
 class raw_ostream;
 
-namespace amdhsa {
-struct kernel_descriptor_t;
-}
-
 namespace AMDGPU {
 
 struct IsaVersion;
@@ -855,9 +851,6 @@ unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc);
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const MCSubtargetInfo *STI);
 
-amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
-    const MCSubtargetInfo *STI);
-
 bool isGroupSegment(const GlobalValue *GV);
 bool isGlobalSegment(const GlobalValue *GV);
 bool isReadOnlySegment(const GlobalValue *GV);
diff --git a/llvm/test/MC/AMDGPU/hsa-amdgpu-exprs.s b/llvm/test/MC/AMDGPU/hsa-amdgpu-exprs.s
new file mode 100644
index 00000000000000..4623500987be88
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-amdgpu-exprs.s
@@ -0,0 +1,27 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s > %t
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// OBJDUMP:       0000 00000000 0f000000 00000000 00000000
+
+.text
+
+.p2align 8
+.type caller,@function
+caller:
+  s_endpgm
+
+.rodata
+
+.p2align 6
+.amdhsa_kernel caller
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_private_segment_fixed_size max(7, callee1.private_seg_size, callee2.private_seg_size)
+.end_amdhsa_kernel
+
+.set callee1.private_seg_size, 4
+.set callee2.private_seg_size, 15
+
+// ASM: .amdhsa_private_segment_fixed_size max(7, callee1.private_seg_size, callee2.private_seg_size)
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-expr-failure.s b/llvm/test/MC/AMDGPU/hsa-sym-expr-failure.s
new file mode 100644
index 00000000000000..fab3e893352b21
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-sym-expr-failure.s
@@ -0,0 +1,281 @@
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=ASM %s
+
+// Some directives currently require (immediately) resolvable expressions, i.e.,
+// expressions that don't depend on yet-unknown symbolic values.
+
+.text
+// ASM: .text
+
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
+.p2align 8
+.type user_sgpr_count,@function
+user_sgpr_count:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel user_sgpr_count
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_count defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_count
+
+.p2align 8
+.type user_sgpr_private_segment_buffer,@function
+user_sgpr_private_segment_buffer:
+  s_endpgm
+
+.amdhsa_kernel user_sgpr_private_segment_buffer
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_private_segment_buffer defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_private_segment_buffer
+
+.p2align 8
+.type user_sgpr_kernarg_preload_length,@function
+user_sgpr_kernarg_preload_length:
+  s_endpgm
+
+.amdhsa_kernel user_sgpr_kernarg_preload_length
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_kernarg_preload_length defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_kernarg_preload_length defined_boolean
+
+.p2align 8
+.type user_sgpr_kernarg_preload_offset,@function
+user_sgpr_kernarg_preload_offset:
+  s_endpgm
+
+.amdhsa_kernel user_sgpr_kernarg_preload_offset
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_kernarg_preload_offset defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_kernarg_preload_offset defined_boolean
+
+.p2align 8
+.type user_sgpr_dispatch_ptr,@function
+user_sgpr_dispatch_ptr:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel user_sgpr_dispatch_ptr
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_dispatch_ptr defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_dispatch_ptr
+
+.p2align 8
+.type user_sgpr_queue_ptr,@function
+user_sgpr_queue_ptr:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel user_sgpr_queue_ptr
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_queue_ptr defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_queue_ptr
+
+.p2align 8
+.type user_sgpr_kernarg_segment_ptr,@function
+user_sgpr_kernarg_segment_ptr:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel user_sgpr_kernarg_segment_ptr
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_kernarg_segment_ptr defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_kernarg_segment_ptr
+
+.p2align 8
+.type user_sgpr_dispatch_id,@function
+user_sgpr_dispatch_id:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel user_sgpr_dispatch_id
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_dispatch_id defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_dispatch_id
+
+.p2align 8
+.type user_sgpr_flat_scratch_init,@function
+user_sgpr_flat_scratch_init:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel user_sgpr_flat_scratch_init
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_flat_scratch_init defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_flat_scratch_init
+
+.p2align 8
+.type user_sgpr_private_segment_size,@function
+user_sgpr_private_segment_size:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel user_sgpr_private_segment_size
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_user_sgpr_private_segment_size defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_user_sgpr_private_segment_size
+
+.p2align 8
+.type wavefront_size32,@function
+wavefront_size32:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel wavefront_size32
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_wavefront_size32 defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_wavefront_size32
+
+.p2align 8
+.type next_free_vgpr,@function
+next_free_vgpr:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel next_free_vgpr
+  .amdhsa_next_free_vgpr defined_boolean
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_next_free_vgpr
+
+.p2align 8
+.type next_free_sgpr,@function
+next_free_sgpr:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel next_free_sgpr
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr defined_boolean
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_next_free_sgpr
+
+.p2align 8
+.type accum_offset,@function
+accum_offset:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel accum_offset
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_accum_offset
+
+.p2align 8
+.type reserve_vcc,@function
+reserve_vcc:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel reserve_vcc
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_reserve_vcc defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_reserve_vcc
+
+.p2align 8
+.type reserve_flat_scratch,@function
+reserve_flat_scratch:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel reserve_flat_scratch
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_reserve_flat_scratch defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_reserve_flat_scratch
+
+.p2align 8
+.type shared_vgpr_count,@function
+shared_vgpr_count:
+  s_endpgm
+
+.p2align 6
+.amdhsa_kernel shared_vgpr_count
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+  .amdhsa_shared_vgpr_count defined_boolean
+.end_amdhsa_kernel
+
+// ASM: error: directive should have resolvable expression
+// ASM-NEXT:   .amdhsa_shared_vgpr_count
+
+.set defined_boolean, 1
+
+// ASM:       .set defined_boolean, 1
+// ASM-NEXT:  .no_dead_strip defined_boolean
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
new file mode 100644
index 00000000000000..95af59c413ae6d
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
@@ -0,0 +1,190 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s > %t
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// When going from asm -> asm, the expressions should remain the same (i.e., symbolic).
+// When going from asm -> obj, the expressions should get resolved (through fixups).
+
+// OBJDUMP: Contents of section .rodata
+// expr_defined_later
+// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 00f0afe4 801f007f 000c0000 00000000
+// expr_defined
+// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 00f0afe4 801f007f 000c0000 00000000
+
+.text
+// ASM: .text
+
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
+.p2align 8
+.type expr_defined_later,@function
+expr_defined_later:
+  s_endpgm
+
+.p2align 8
+.type expr_defined,@function
+expr_defined:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+.p2align 6
+.amdhsa_kernel expr_defined_later
+  .amdhsa_group_segment_fixed_size defined_value+2
+  .amdhsa_private_segment_fixed_size defined_value+3
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_fp16_overflow defined_boolean
+  .amdhsa_workgroup_processor_mode defined_boolean
+  .amdhsa_memory_ordered defined_boolean
+  .amdhsa_forward_progress defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+.set defined_value, 41
+.set defined_2_bits, 3
+.set defined_boolean, 1
+
+.p2align 6
+.amdhsa_kernel expr_defined
+  .amdhsa_group_segment_fixed_size defined_value+1
+  .amdhsa_private_segment_fixed_size defined_value+2
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_fp16_overflow defined_boolean
+  .amdhsa_workgroup_processor_mode defined_boolean
+  .amdhsa_memory_ordered defined_boolean
+  .amdhsa_forward_progress defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel expr_defined_later
+// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2
+// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&62)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&1)>>0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&2)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&4)>>2
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&8)>>3
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&16)>>4
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&32)>>5
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&64)>>6
+// ASM-NEXT: .amdhsa_wavefront_size32 (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&1024)>>10
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1)>>0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&6144)>>11
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18
+// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21
+// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23
+// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&67108864)>>26
+// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&536870912)>>29
+// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&1073741824)>>30
+// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&2147483648)>>31
+// ASM-NEXT: .amdhsa_shared_vgpr_count 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1073741824)>>30
+// ASM-NEXT: .end_amdhsa_kernel
+
+// ASM:       .set defined_value, 41
+// ASM-NEXT:  .no_dead_strip defined_value
+// ASM-NEXT:  .set defined_2_bits, 3
+// ASM-NEXT:  .no_dead_strip defined_2_bits
+// ASM-NEXT:  .set defined_boolean, 1
+// ASM-NEXT:  .no_dead_strip defined_boolean
+
+// ASM: .amdhsa_kernel expr_defined
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 42
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 43
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+// ASM-NEXT: .amdhsa_wavefront_size32 1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 3
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3
+// ASM-NEXT: .amdhsa_dx10_clamp 1
+// ASM-NEXT: .amdhsa_ieee_mode 1
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_workgroup_processor_mode 1
+// ASM-NEXT: .amdhsa_memory_ordered 1
+// ASM-NEXT: .amdhsa_forward_progress 1
+// ASM-NEXT: .amdhsa_shared_vgpr_count 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
new file mode 100644
index 00000000000000..e1107fb69ba410
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
@@ -0,0 +1,186 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s > %t
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// When going from asm -> asm, the expressions should remain the same (i.e., symbolic).
+// When going from asm -> obj, the expressions should get resolved (through fixups).
+
+// OBJDUMP: Contents of section .rodata
+// expr_defined_later
+// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 00f0afe4 811f007f 000c0000 00000000
+// expr_defined
+// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 00f0afe4 811f007f 000c0000 00000000
+
+.text
+// ASM: .text
+
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
+.p2align 8
+.type expr_defined_later,@function
+expr_defined_later:
+  s_endpgm
+
+.p2align 8
+.type expr_defined,@function
+expr_defined:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+.p2align 6
+.amdhsa_kernel expr_defined_later
+  .amdhsa_group_segment_fixed_size defined_value+2
+  .amdhsa_private_segment_fixed_size defined_value+3
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_fp16_overflow defined_boolean
+  .amdhsa_workgroup_processor_mode defined_boolean
+  .amdhsa_memory_ordered defined_boolean
+  .amdhsa_forward_progress defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_enable_private_segment defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+.set defined_value, 41
+.set defined_2_bits, 3
+.set defined_boolean, 1
+
+.p2align 6
+.amdhsa_kernel expr_defined
+  .amdhsa_group_segment_fixed_size defined_value+1
+  .amdhsa_private_segment_fixed_size defined_value+2
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_fp16_overflow defined_boolean
+  .amdhsa_workgroup_processor_mode defined_boolean
+  .amdhsa_memory_ordered defined_boolean
+  .amdhsa_forward_progress defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_enable_private_segment defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel expr_defined_later
+// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2
+// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&62)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&2)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&4)>>2
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&8)>>3
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&16)>>4
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&64)>>6
+// ASM-NEXT: .amdhsa_wavefront_size32 (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&1024)>>10
+// ASM-NEXT: .amdhsa_enable_private_segment (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1)>>0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&6144)>>11
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18
+// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21
+// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23
+// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&67108864)>>26
+// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&536870912)>>29
+// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&1073741824)>>30
+// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(0<<0))&(~960))|(0<<6))&2147483648)>>31
+// ASM-NEXT: .amdhsa_shared_vgpr_count 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1073741824)>>30
+// ASM-NEXT: .end_amdhsa_kernel
+
+// ASM:       .set defined_value, 41
+// ASM-NEXT:  .no_dead_strip defined_value
+// ASM-NEXT:  .set defined_2_bits, 3
+// ASM-NEXT:  .no_dead_strip defined_2_bits
+// ASM-NEXT:  .set defined_boolean, 1
+// ASM-NEXT:  .no_dead_strip defined_boolean
+
+// ASM: .amdhsa_kernel expr_defined
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 42
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 43
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+// ASM-NEXT: .amdhsa_wavefront_size32 1
+// ASM-NEXT: .amdhsa_enable_private_segment 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 3
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3
+// ASM-NEXT: .amdhsa_dx10_clamp 1
+// ASM-NEXT: .amdhsa_ieee_mode 1
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_workgroup_processor_mode 1
+// ASM-NEXT: .amdhsa_memory_ordered 1
+// ASM-NEXT: .amdhsa_forward_progress 1
+// ASM-NEXT: .amdhsa_shared_vgpr_count 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s
new file mode 100644
index 00000000000000..449616d35186b7
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s
@@ -0,0 +1,184 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s > %t
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// When going from asm -> asm, the expressions should remain the same (i.e., symbolic).
+// When going from asm -> obj, the expressions should get resolved (through fixups).
+
+// OBJDUMP: Contents of section .rodata
+// expr_defined_later
+// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 00f02fe4 811f007f 000c0000 00000000
+// expr_defined
+// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 00f02fe4 811f007f 000c0000 00000000
+
+.text
+// ASM: .text
+
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
+.p2align 8
+.type expr_defined_later,@function
+expr_defined_later:
+  s_endpgm
+
+.p2align 8
+.type expr_defined,@function
+expr_defined:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+.p2align 6
+.amdhsa_kernel expr_defined_later
+  .amdhsa_group_segment_fixed_size defined_value+2
+  .amdhsa_private_segment_fixed_size defined_value+3
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_fp16_overflow defined_boolean
+  .amdhsa_workgroup_processor_mode defined_boolean
+  .amdhsa_memory_ordered defined_boolean
+  .amdhsa_forward_progress defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_round_robin_scheduling defined_boolean
+  .amdhsa_enable_private_segment defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+.set defined_value, 41
+.set defined_2_bits, 3
+.set defined_boolean, 1
+
+.p2align 6
+.amdhsa_kernel expr_defined
+  .amdhsa_group_segment_fixed_size defined_value+1
+  .amdhsa_private_segment_fixed_size defined_value+2
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_fp16_overflow defined_boolean
+  .amdhsa_workgroup_processor_mode defined_boolean
+  .amdhsa_memory_ordered defined_boolean
+  .amdhsa_forward_progress defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_round_robin_scheduling defined_boolean
+  .amdhsa_enable_private_segment defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel expr_defined_later
+// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2
+// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&62)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&2)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&4)>>2
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&8)>>3
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&16)>>4
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&64)>>6
+// ASM-NEXT: .amdhsa_wavefront_size32 (((((0&(~1024))|(1<<10))&(~2048))|(defined_boolean<<11))&1024)>>10
+// ASM-NEXT: .amdhsa_enable_private_segment (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1)>>0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&6144)>>11
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18
+// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&67108864)>>26
+// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&536870912)>>29
+// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&1073741824)>>30
+// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&2147483648)>>31
+// ASM-NEXT: .amdhsa_round_robin_scheduling (((((((((((((((((((((((((((((0&(~786432))|(3<<18))&(~536870912))|(1<<29))&(~1073741824))|(1<<30))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1073741824)>>30
+// ASM-NEXT: .end_amdhsa_kernel
+
+// ASM:       .set defined_value, 41
+// ASM-NEXT:  .no_dead_strip defined_value
+// ASM-NEXT:  .set defined_2_bits, 3
+// ASM-NEXT:  .no_dead_strip defined_2_bits
+// ASM-NEXT:  .set defined_boolean, 1
+// ASM-NEXT:  .no_dead_strip defined_boolean
+
+// ASM: .amdhsa_kernel expr_defined
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 42
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 43
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+// ASM-NEXT: .amdhsa_wavefront_size32 1
+// ASM-NEXT: .amdhsa_enable_private_segment 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 3
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_workgroup_processor_mode 1
+// ASM-NEXT: .amdhsa_memory_ordered 1
+// ASM-NEXT: .amdhsa_forward_progress 1
+// ASM-NEXT: .amdhsa_round_robin_scheduling 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s
new file mode 100644
index 00000000000000..c7e05441b45ff7
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s
@@ -0,0 +1,168 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj < %s > %t
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// When going from asm -> asm, the expressions should remain the same (i.e., symbolic).
+// When going from asm -> obj, the expressions should get resolved (through fixups).
+
+// OBJDUMP: Contents of section .rodata
+// expr_defined_later
+// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 00f0af00 801f007f 00080000 00000000
+// expr_defined
+// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 00f0af00 801f007f 00080000 00000000
+
+.text
+// ASM: .text
+
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
+.p2align 8
+.type expr_defined_later,@function
+expr_defined_later:
+  s_endpgm
+
+.p2align 8
+.type expr_defined,@function
+expr_defined:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+.p2align 6
+.amdhsa_kernel expr_defined_later
+  .amdhsa_group_segment_fixed_size defined_value+2
+  .amdhsa_private_segment_fixed_size defined_value+3
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+.set defined_value, 41
+.set defined_2_bits, 3
+.set defined_boolean, 1
+
+.p2align 6
+.amdhsa_kernel expr_defined
+  .amdhsa_group_segment_fixed_size defined_value+1
+  .amdhsa_private_segment_fixed_size defined_value+2
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel expr_defined_later
+// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2
+// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&62)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer (((0&(~2048))|(defined_boolean<<11))&1)>>0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((0&(~2048))|(defined_boolean<<11))&2)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((0&(~2048))|(defined_boolean<<11))&4)>>2
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((0&(~2048))|(defined_boolean<<11))&8)>>3
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((0&(~2048))|(defined_boolean<<11))&16)>>4
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init (((0&(~2048))|(defined_boolean<<11))&32)>>5
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((0&(~2048))|(defined_boolean<<11))&64)>>6
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1)>>0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&6144)>>11
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18
+// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21
+// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1073741824)>>30
+// ASM-NEXT: .end_amdhsa_kernel
+
+// ASM:       .set defined_value, 41
+// ASM-NEXT:  .no_dead_strip defined_value
+// ASM-NEXT:  .set defined_2_bits, 3
+// ASM-NEXT:  .no_dead_strip defined_2_bits
+// ASM-NEXT:  .set defined_boolean, 1
+// ASM-NEXT:  .no_dead_strip defined_boolean
+
+// ASM: .amdhsa_kernel expr_defined
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 42
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 43
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 3
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3
+// ASM-NEXT: .amdhsa_dx10_clamp 1
+// ASM-NEXT: .amdhsa_ieee_mode 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s
new file mode 100644
index 00000000000000..49a5015987a651
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s
@@ -0,0 +1,171 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck --check-prefix=ASM %s
+
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx801 -filetype=obj < %s > %t
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// When going from asm -> asm, the expressions should remain the same (i.e., symbolic).
+// When going from asm -> obj, the expressions should get resolved (through fixups).
+
+// OBJDUMP: Contents of section .rodata
+// expr_defined_later
+// OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 00f0af00 801f007f 00080000 00000000
+// expr_defined
+// OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 00f0af00 801f007f 00080000 00000000
+
+.text
+// ASM: .text
+
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
+.p2align 8
+.type expr_defined_later,@function
+expr_defined_later:
+  s_endpgm
+
+.p2align 8
+.type expr_defined,@function
+expr_defined:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+.p2align 6
+.amdhsa_kernel expr_defined_later
+  .amdhsa_group_segment_fixed_size defined_value+2
+  .amdhsa_private_segment_fixed_size defined_value+3
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+.set defined_value, 41
+.set defined_2_bits, 3
+.set defined_boolean, 1
+
+.p2align 6
+.amdhsa_kernel expr_defined
+  .amdhsa_group_segment_fixed_size defined_value+1
+  .amdhsa_private_segment_fixed_size defined_value+2
+  .amdhsa_system_vgpr_workitem_id defined_2_bits
+  .amdhsa_float_round_mode_32 defined_2_bits
+  .amdhsa_float_round_mode_16_64 defined_2_bits
+  .amdhsa_float_denorm_mode_32 defined_2_bits
+  .amdhsa_float_denorm_mode_16_64 defined_2_bits
+  .amdhsa_system_sgpr_workgroup_id_x defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_y defined_boolean
+  .amdhsa_system_sgpr_workgroup_id_z defined_boolean
+  .amdhsa_system_sgpr_workgroup_info defined_boolean
+  .amdhsa_exception_fp_ieee_invalid_op defined_boolean
+  .amdhsa_exception_fp_denorm_src defined_boolean
+  .amdhsa_exception_fp_ieee_div_zero defined_boolean
+  .amdhsa_exception_fp_ieee_overflow defined_boolean
+  .amdhsa_exception_fp_ieee_underflow defined_boolean
+  .amdhsa_exception_fp_ieee_inexact defined_boolean
+  .amdhsa_exception_int_div_zero defined_boolean
+  .amdhsa_uses_dynamic_stack defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel expr_defined_later
+// ASM-NEXT: .amdhsa_group_segment_fixed_size defined_value+2
+// ASM-NEXT: .amdhsa_private_segment_fixed_size defined_value+3
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&62)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer (((0&(~2048))|(defined_boolean<<11))&1)>>0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr (((0&(~2048))|(defined_boolean<<11))&2)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr (((0&(~2048))|(defined_boolean<<11))&4)>>2
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr (((0&(~2048))|(defined_boolean<<11))&8)>>3
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id (((0&(~2048))|(defined_boolean<<11))&16)>>4
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init (((0&(~2048))|(defined_boolean<<11))&32)>>5
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size (((0&(~2048))|(defined_boolean<<11))&64)>>6
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1)>>0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&6144)>>11
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18
+// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21
+// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~12288))|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((((0&(~128))|(1<<7))&(~6144))|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))|(0<<1))&1073741824)>>30
+// ASM-NEXT: .end_amdhsa_kernel
+
+// ASM:       .set defined_value, 41
+// ASM-NEXT:  .no_dead_strip defined_value
+// ASM-NEXT:  .set defined_2_bits, 3
+// ASM-NEXT:  .no_dead_strip defined_2_bits
+// ASM-NEXT:  .set defined_boolean, 1
+// ASM-NEXT:  .no_dead_strip defined_boolean
+
+// ASM: .amdhsa_kernel expr_defined
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 42
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 43
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 3
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 3
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 3
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3
+// ASM-NEXT: .amdhsa_dx10_clamp 1
+// ASM-NEXT: .amdhsa_ieee_mode 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s
new file mode 100644
index 00000000000000..b7f89239160fcf
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s
@@ -0,0 +1,148 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s > %t
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// When going from asm -> asm, the expressions should remain the same (i.e., symbolic).
+// When going from asm -> obj, the expressions should get resolved (through fixups).
+
+// OBJDUMP: Contents of section .rodata
+// expr_defined_later
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000100
+// OBJDUMP-NEXT: 0030 0000ac04 81000000 00000000 00000000
+// expr_defined
+// OBJDUMP-NEXT: 0040 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100
+// OBJDUMP-NEXT: 0070 0000ac04 81000000 00000000 00000000
+
+.text
+// ASM: .text
+
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
+.p2align 8
+.type expr_defined_later,@function
+expr_defined_later:
+  s_endpgm
+
+.p2align 8
+.type expr_defined,@function
+expr_defined:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+.p2align 6
+.amdhsa_kernel expr_defined_later
+  .amdhsa_system_sgpr_private_segment_wavefront_offset defined_boolean
+  .amdhsa_dx10_clamp defined_boolean
+  .amdhsa_ieee_mode defined_boolean
+  .amdhsa_fp16_overflow defined_boolean
+  .amdhsa_tg_split defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+.set defined_boolean, 1
+
+.p2align 6
+.amdhsa_kernel expr_defined
+  .amdhsa_system_sgpr_private_segment_wavefront_offset defined_boolean
+  .amdhsa_dx10_clamp defined_boolean
+  .amdhsa_ieee_mode defined_boolean
+  .amdhsa_fp16_overflow defined_boolean
+  .amdhsa_tg_split defined_boolean
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel expr_defined_later
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 0
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 0
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&62)>>1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1)>>0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&6144)>>11
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_accum_offset (((((((0&(~65536))|(defined_boolean<<16))&(~63))|(0<<0))&63)>>0)+1)*4
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&786432)>>18
+// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&2097152)>>21
+// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&8388608)>>23
+// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((0&(~786432))|(3<<18))&(~2097152))|(1<<21))&(~8388608))|(1<<23))&(~2097152))|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(0<<0))&(~960))|(0<<6))&67108864)>>26
+// ASM-NEXT: .amdhsa_tg_split (((((0&(~65536))|(defined_boolean<<16))&(~63))|(0<<0))&65536)>>16
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((0&(~128))|(1<<7))&(~1))|(defined_boolean<<0))&(~62))|(0<<1))&1073741824)>>30
+// ASM-NEXT: .end_amdhsa_kernel
+
+// ASM:       .set defined_boolean, 1
+// ASM-NEXT:  .no_dead_strip defined_boolean
+
+// ASM: .amdhsa_kernel expr_defined
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 0
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 0
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 0
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 0
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 0
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 0
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3
+// ASM-NEXT: .amdhsa_dx10_clamp 1
+// ASM-NEXT: .amdhsa_ieee_mode 1
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_tg_split 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+// ASM-NEXT: .amdhsa_exception_int_div_zero 0
+// ASM-NEXT: .end_amdhsa_kernel
diff --git a/llvm/test/MC/AMDGPU/hsa-tg-split.s b/llvm/test/MC/AMDGPU/hsa-tg-split.s
new file mode 100644
index 00000000000000..5a4d3e2c279c50
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-tg-split.s
@@ -0,0 +1,74 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack,+tgsplit < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack,+tgsplit -filetype=obj < %s > %t
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// OBJDUMP: Contents of section .rodata
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000100
+// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+"
+
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
+.p2align 8
+.type minimal,@function
+minimal:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+.p2align 6
+.amdhsa_kernel minimal
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 0
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 0
+// ASM-NEXT: .amdhsa_kernarg_size 0
+// ASM-NEXT: .amdhsa_user_sgpr_count 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 0
+// ASM-NEXT: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 0
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 0
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 0
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3
+// ASM-NEXT: .amdhsa_dx10_clamp 1
+// ASM-NEXT: .amdhsa_ieee_mode 1
+// ASM-NEXT: .amdhsa_fp16_overflow 0
+// ASM-NEXT: .amdhsa_tg_split 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+// ASM-NEXT: .amdhsa_exception_int_div_zero 0
+// ASM-NEXT: .end_amdhsa_kernel

>From 408c36522f7eb8638314b584995daf5790968842 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Wed, 27 Mar 2024 12:00:32 +0000
Subject: [PATCH 13/54] [gn build] Port 1103a2a337e9

---
 .../gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
index 12d875cf40c98b..5ba91fcec83a0d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
@@ -104,6 +104,7 @@ static_library("MCTargetDesc") {
     "AMDGPUMCAsmInfo.cpp",
     "AMDGPUMCCodeEmitter.cpp",
     "AMDGPUMCExpr.cpp",
+    "AMDGPUMCKernelDescriptor.cpp",
     "AMDGPUMCTargetDesc.cpp",
     "AMDGPUTargetStreamer.cpp",
     "R600InstPrinter.cpp",

>From 51388fbab1b9454dfe24c4ac1c8b4a009162386a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 26 Mar 2024 18:37:15 +0000
Subject: [PATCH 14/54] [DAG] visitSub - reuse existing SDLoc instead of
 regenerating it. NFC.

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 39ee95d7007ce6..a021e0e19fc3cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3884,7 +3884,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (SDValue V = foldSubToAvg(N, DL))
     return V;
 
-  if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
+  if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, DL))
     return V;
 
   if (SDValue V = foldSubToUSubSat(VT, N, DL))
@@ -3949,7 +3949,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
       if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0))
         if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
           if (C->getAPIntValue() == (BitWidth - 1))
-            return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
+            return DAG.getNode(ISD::ABS, DL, VT, S0);
     }
   }
 

>From 9247f3185c7e1f7a2c1071fa61e283deb21091aa Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 26 Mar 2024 18:41:47 +0000
Subject: [PATCH 15/54] [DAG] foldAddSubOfSignBit - reuse existing SDLoc
 instead of regenerating it. NFC.

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a021e0e19fc3cf..36abe27d262176 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2555,7 +2555,8 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) {
 
 /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
 /// a shift and add with a different constant.
-static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
+static SDValue foldAddSubOfSignBit(SDNode *N, const SDLoc &DL,
+                                   SelectionDAG &DAG) {
   assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
          "Expecting add or sub");
 
@@ -2583,7 +2584,6 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
   // Eliminate the 'not' by adjusting the shift and add/sub constant:
   // add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
   // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
-  SDLoc DL(N);
   if (SDValue NewC = DAG.FoldConstantArithmetic(
           IsAdd ? ISD::ADD : ISD::SUB, DL, VT,
           {ConstantOp, DAG.getConstant(1, DL, VT)})) {
@@ -2878,7 +2878,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (SDValue V = foldAddSubBoolOfMaskedVal(N, DL, DAG))
     return V;
 
-  if (SDValue V = foldAddSubOfSignBit(N, DAG))
+  if (SDValue V = foldAddSubOfSignBit(N, DL, DAG))
     return V;
 
   // Try to match AVGFLOOR fixedwidth pattern
@@ -3877,7 +3877,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (SDValue V = foldAddSubBoolOfMaskedVal(N, DL, DAG))
     return V;
 
-  if (SDValue V = foldAddSubOfSignBit(N, DAG))
+  if (SDValue V = foldAddSubOfSignBit(N, DL, DAG))
     return V;
 
   // Try to match AVGCEIL fixedwidth pattern

>From 875aed17b978bf58a01d31572af6964e91a9f641 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 12:13:03 +0000
Subject: [PATCH 16/54] [X86] Add combineExtractFromVectorLoad helper - pulled
 out of combineExtractVectorElt

Prep work for #85419 to make it easier to reuse in other combines
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 78 ++++++++++++++++---------
 1 file changed, 49 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9bad38f97c6a29..4cd0bebe01bb48 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43995,6 +43995,49 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                      Extract->getOperand(1));
 }
 
+// If this extract is from a loaded vector value and will be used as an
+// integer, that requires a potentially expensive XMM -> GPR transfer.
+// Additionally, if we can convert to a scalar integer load, that will likely
+// be folded into a subsequent integer op.
+// Note: Unlike the related fold for this in DAGCombiner, this is not limited
+//       to a single-use of the loaded vector. For the reasons above, we
+//       expect this to be profitable even if it creates an extra load.
+static SDValue
+combineExtractFromVectorLoad(SDNode *N, SDValue InputVector, uint64_t Idx,
+                             const SDLoc &dl, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+         "Only EXTRACT_VECTOR_ELT supported so far");
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT SrcVT = InputVector.getValueType();
+  EVT VT = N->getValueType(0);
+
+  bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
+    return Use->getOpcode() == ISD::STORE ||
+           Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
+           Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
+  });
+
+  auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
+  if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
+      SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
+      !LikelyUsedAsVector && LoadVec->isSimple()) {
+    SDValue NewPtr = TLI.getVectorElementPointer(
+        DAG, LoadVec->getBasePtr(), SrcVT, DAG.getVectorIdxConstant(Idx, dl));
+    unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
+    MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
+    Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
+    SDValue Load =
+        DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
+                    LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
+    DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
+    return Load;
+  }
+
+  return SDValue();
+}
+
 // Attempt to peek through a target shuffle and extract the scalar from the
 // source.
 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
@@ -44600,6 +44643,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
     return V;
 
+  if (CIdx)
+    if (SDValue V = combineExtractFromVectorLoad(
+            N, InputVector, CIdx->getZExtValue(), dl, DAG, DCI))
+      return V;
+
   // Attempt to extract a i1 element by using MOVMSK to extract the signbits
   // and then testing the relevant element.
   //
@@ -44645,34 +44693,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // If this extract is from a loaded vector value and will be used as an
-  // integer, that requires a potentially expensive XMM -> GPR transfer.
-  // Additionally, if we can convert to a scalar integer load, that will likely
-  // be folded into a subsequent integer op.
-  // Note: Unlike the related fold for this in DAGCombiner, this is not limited
-  //       to a single-use of the loaded vector. For the reasons above, we
-  //       expect this to be profitable even if it creates an extra load.
-  bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
-    return Use->getOpcode() == ISD::STORE ||
-           Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
-           Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
-  });
-  auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
-  if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
-      SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
-      !LikelyUsedAsVector && LoadVec->isSimple()) {
-    SDValue NewPtr =
-        TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
-    unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
-    MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
-    Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
-    SDValue Load =
-        DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
-                    LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
-    DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
-    return Load;
-  }
-
   return SDValue();
 }
 
@@ -48273,7 +48293,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
 
   // We do not split for SSE at all, but we need to split vectors for AVX1 and
   // AVX2.
-  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && 
+  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
       TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
     SDValue LoX, HiX;
     std::tie(LoX, HiX) = splitVector(X, DAG, DL);

>From e82765bf07a978674c0e75c8b2e20f154ae24a4c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 12:18:54 +0000
Subject: [PATCH 17/54] [X86] masked_store.ll - add nounwind to remove cfi
 noise

---
 llvm/test/CodeGen/X86/masked_store.ll | 79 +++++++++++----------------
 1 file changed, 33 insertions(+), 46 deletions(-)

diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 898b34e969b1d2..03245ea31730e4 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -12,7 +12,7 @@
 ; vXf64
 ;
 
-define void @store_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %val) {
+define void @store_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %val) nounwind {
 ; SSE-LABEL: store_v1f64_v1i64:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    testq %rdi, %rdi
@@ -46,7 +46,7 @@ define void @store_v1f64_v1i64(<1 x i64> %trigger, ptr %addr, <1 x double> %val)
   ret void
 }
 
-define void @store_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %val) {
+define void @store_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %val) nounwind {
 ; SSE-LABEL: store_v2f64_v2i64:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    movmskpd %xmm0, %eax
@@ -106,7 +106,7 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %val)
   ret void
 }
 
-define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val) {
+define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val) nounwind {
 ; SSE2-LABEL: store_v4f64_v4i64:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
@@ -222,7 +222,7 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val)
 ; vXf32
 ;
 
-define void @store_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+define void @store_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %val) nounwind {
 ; SSE2-LABEL: store_v2f32_v2i32:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -314,7 +314,7 @@ define void @store_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %val)
   ret void
 }
 
-define void @store_v4f32_v4i32(<4 x float> %x, ptr %ptr, <4 x float> %y, <4 x i32> %mask) {
+define void @store_v4f32_v4i32(<4 x float> %x, ptr %ptr, <4 x float> %y, <4 x i32> %mask) nounwind {
 ; SSE2-LABEL: store_v4f32_v4i32:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movmskps %xmm2, %eax
@@ -425,7 +425,7 @@ define void @store_v4f32_v4i32(<4 x float> %x, ptr %ptr, <4 x float> %y, <4 x i3
   ret void
 }
 
-define void @store_v8f32_v8i32(<8 x float> %x, ptr %ptr, <8 x float> %y, <8 x i32> %mask) {
+define void @store_v8f32_v8i32(<8 x float> %x, ptr %ptr, <8 x float> %y, <8 x i32> %mask) nounwind {
 ; SSE2-LABEL: store_v8f32_v8i32:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
@@ -605,7 +605,7 @@ define void @store_v8f32_v8i32(<8 x float> %x, ptr %ptr, <8 x float> %y, <8 x i3
   ret void
 }
 
-define void @store_v16f32_v16i32(<16 x float> %x, ptr %ptr, <16 x float> %y, <16 x i32> %mask) {
+define void @store_v16f32_v16i32(<16 x float> %x, ptr %ptr, <16 x float> %y, <16 x i32> %mask) nounwind {
 ; SSE2-LABEL: store_v16f32_v16i32:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4
@@ -914,7 +914,7 @@ define void @store_v16f32_v16i32(<16 x float> %x, ptr %ptr, <16 x float> %y, <16
 ; vXi64
 ;
 
-define void @store_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %val) {
+define void @store_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %val) nounwind {
 ; SSE2-LABEL: store_v2i64_v2i64:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movmskpd %xmm0, %eax
@@ -998,7 +998,7 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %val) {
   ret void
 }
 
-define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) {
+define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) nounwind {
 ; SSE2-LABEL: store_v4i64_v4i64:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
@@ -1122,7 +1122,7 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) {
 ; vXi32
 ;
 
-define void @store_v1i32_v1i32(<1 x i32> %trigger, ptr %addr, <1 x i32> %val) {
+define void @store_v1i32_v1i32(<1 x i32> %trigger, ptr %addr, <1 x i32> %val) nounwind {
 ; SSE-LABEL: store_v1i32_v1i32:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    testl %edi, %edi
@@ -1156,7 +1156,7 @@ define void @store_v1i32_v1i32(<1 x i32> %trigger, ptr %addr, <1 x i32> %val) {
   ret void
 }
 
-define void @store_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+define void @store_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) nounwind {
 ; SSE2-LABEL: store_v2i32_v2i32:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -1256,7 +1256,7 @@ define void @store_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
   ret void
 }
 
-define void @store_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+define void @store_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) nounwind {
 ; SSE2-LABEL: store_v4i32_v4i32:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
@@ -1370,7 +1370,7 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
   ret void
 }
 
-define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) {
+define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) nounwind {
 ; SSE2-LABEL: store_v8i32_v8i32:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
@@ -1560,7 +1560,7 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) {
 ; vXi16
 ;
 
-define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) {
+define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) nounwind {
 ; SSE2-LABEL: store_v8i16_v8i16:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
@@ -1907,7 +1907,7 @@ define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) {
   ret void
 }
 
-define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val) {
+define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val) nounwind {
 ; SSE2-LABEL: store_v16i16_v16i16:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
@@ -2676,7 +2676,7 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val
 ; vXi8
 ;
 
-define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) {
+define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) nounwind {
 ; SSE2-LABEL: store_v16i8_v16i8:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
@@ -3273,7 +3273,7 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) {
   ret void
 }
 
-define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) {
+define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) nounwind {
 ; SSE2-LABEL: store_v32i8_v32i8:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
@@ -4670,7 +4670,7 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) {
 
 ;;; Stores with Constant Masks
 
-define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) nounwind {
 ; SSE-LABEL: mstore_constmask_v4i32_v4i32:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    movups %xmm1, (%rdi)
@@ -4693,7 +4693,7 @@ define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i3
 
 ; Make sure we are able to detect all ones constant mask after type legalization
 ; to avoid masked stores.
-define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 x i64> %val) {
+define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 x i64> %val) nounwind {
 ; SSE2-LABEL: mstore_constmask_allones_split:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
@@ -4810,7 +4810,7 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16
 
 ;  When only one element of the mask is set, reduce to a scalar store.
 
-define void @one_mask_bit_set1(ptr %addr, <4 x i32> %val) {
+define void @one_mask_bit_set1(ptr %addr, <4 x i32> %val) nounwind {
 ; SSE-LABEL: one_mask_bit_set1:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    movss %xmm0, (%rdi)
@@ -4832,7 +4832,7 @@ define void @one_mask_bit_set1(ptr %addr, <4 x i32> %val) {
 
 ; Choose a different element to show that the correct address offset is produced.
 
-define void @one_mask_bit_set2(ptr %addr, <4 x float> %val) {
+define void @one_mask_bit_set2(ptr %addr, <4 x float> %val) nounwind {
 ; SSE2-LABEL: one_mask_bit_set2:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -4860,7 +4860,7 @@ define void @one_mask_bit_set2(ptr %addr, <4 x float> %val) {
 
 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
 
-define void @one_mask_bit_set3(ptr %addr, <4 x i64> %val) {
+define void @one_mask_bit_set3(ptr %addr, <4 x i64> %val) nounwind {
 ; SSE-LABEL: one_mask_bit_set3:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    movlps %xmm1, 16(%rdi)
@@ -4886,7 +4886,7 @@ define void @one_mask_bit_set3(ptr %addr, <4 x i64> %val) {
 
 ; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
 
-define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) {
+define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) nounwind {
 ; SSE-LABEL: one_mask_bit_set4:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    movhps %xmm1, 24(%rdi)
@@ -4912,7 +4912,7 @@ define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) {
 
 ; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
 
-define void @one_mask_bit_set5(ptr %addr, <8 x double> %val) {
+define void @one_mask_bit_set5(ptr %addr, <8 x double> %val) nounwind {
 ; SSE-LABEL: one_mask_bit_set5:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    movlps %xmm3, 48(%rdi)
@@ -4944,7 +4944,7 @@ define void @one_mask_bit_set5(ptr %addr, <8 x double> %val) {
 }
 
 ; Try one elt in each half of a vector that needs to split
-define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) {
+define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) nounwind {
 ; SSE2-LABEL: one_mask_bit_set6:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movlps %xmm3, 48(%rdi)
@@ -4999,7 +4999,7 @@ define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) {
   ret void
 }
 
-define void @top_bits_unset_stack() {
+define void @top_bits_unset_stack() nounwind {
 ; SSE-LABEL: top_bits_unset_stack:
 ; SSE:       ## %bb.0: ## %entry
 ; SSE-NEXT:    xorps %xmm0, %xmm0
@@ -5047,7 +5047,6 @@ define void @top_bits_unset_stack() {
 ; X86-AVX512-LABEL: top_bits_unset_stack:
 ; X86-AVX512:       ## %bb.0: ## %entry
 ; X86-AVX512-NEXT:    subl $76, %esp
-; X86-AVX512-NEXT:    .cfi_def_cfa_offset 80
 ; X86-AVX512-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    movb $63, %al
 ; X86-AVX512-NEXT:    kmovd %eax, %k1
@@ -5064,7 +5063,7 @@ entry:
 
 ; SimplifyDemandedBits eliminates an ashr here.
 
-define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, ptr %p, <4 x i32> %masksrc) {
+define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, ptr %p, <4 x i32> %masksrc) nounwind {
 ; SSE-LABEL: masked_store_bool_mask_demand_trunc_sext:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    pslld $31, %xmm2
@@ -5160,7 +5159,7 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, ptr %p, <
 
 ; PR26697
 
-define void @one_mask_bit_set1_variable(ptr %addr, <4 x float> %val, <4 x i32> %mask) {
+define void @one_mask_bit_set1_variable(ptr %addr, <4 x float> %val, <4 x i32> %mask) nounwind {
 ; SSE2-LABEL: one_mask_bit_set1_variable:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movmskps %xmm1, %eax
@@ -5267,7 +5266,7 @@ define void @one_mask_bit_set1_variable(ptr %addr, <4 x float> %val, <4 x i32> %
 ; This needs to be widened to v4i32.
 ; This used to assert in type legalization. PR38436
 ; FIXME: The codegen for AVX512 should use KSHIFT to zero the upper bits of the mask.
-define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) {
+define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) nounwind {
 ; SSE2-LABEL: widen_masked_store:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    andb $1, %sil
@@ -5448,7 +5447,7 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) {
   ret void
 }
 
-define void @zero_mask(ptr %addr, <2 x double> %val) {
+define void @zero_mask(ptr %addr, <2 x double> %val) nounwind {
 ; SSE-LABEL: zero_mask:
 ; SSE:       ## %bb.0:
 ; SSE-NEXT:    retq
@@ -5464,7 +5463,7 @@ define void @zero_mask(ptr %addr, <2 x double> %val) {
   ret void
 }
 
-define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask) {
+define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask) nounwind {
 ; SSE2-LABEL: PR11210:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movmskps %xmm2, %eax
@@ -5638,7 +5637,7 @@ define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask)
   ret void
 }
 
-define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigger.ptr, ptr %val.ptr, ptr %dst) {
+define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigger.ptr, ptr %val.ptr, ptr %dst) nounwind {
 ; SSE2-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
 ; SSE2:       ## %bb.0:
 ; SSE2-NEXT:    movdqa (%rdi), %xmm6
@@ -5874,23 +5873,11 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
 ; SSE4-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
 ; SSE4:       ## %bb.0:
 ; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    .cfi_def_cfa_offset 16
 ; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    .cfi_def_cfa_offset 24
 ; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    .cfi_def_cfa_offset 32
 ; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    .cfi_def_cfa_offset 40
 ; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    .cfi_def_cfa_offset 48
 ; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    .cfi_def_cfa_offset 56
-; SSE4-NEXT:    .cfi_offset %rbx, -56
-; SSE4-NEXT:    .cfi_offset %r12, -48
-; SSE4-NEXT:    .cfi_offset %r13, -40
-; SSE4-NEXT:    .cfi_offset %r14, -32
-; SSE4-NEXT:    .cfi_offset %r15, -24
-; SSE4-NEXT:    .cfi_offset %rbp, -16
 ; SSE4-NEXT:    movdqa (%rdi), %xmm1
 ; SSE4-NEXT:    movdqa 32(%rdi), %xmm2
 ; SSE4-NEXT:    movdqa 64(%rdi), %xmm0
@@ -6266,7 +6253,7 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
 }
 
 ; From https://reviews.llvm.org/rGf8d9097168b7#1165311
-define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) #0 {
+define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) nounwind {
 ; SSE2-LABEL: undefshuffle:
 ; SSE2:       ## %bb.0: ## %else
 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)

>From b8cc838427efa80eb5ca4ec7c8adb53e4adfc4c7 Mon Sep 17 00:00:00 2001
From: komalverma04 <komal148btit21 at igdtuw.ac.in>
Date: Wed, 27 Mar 2024 05:51:27 -0700
Subject: [PATCH 18/54] [analyzer][docs] Document the
 `optin.performance.Padding` checker (#86411)

Closes #73675

Co-authored-by: Balazs Benics <benicsbalazs at gmail.com>
Co-authored-by: NagyDonat <donat.nagy at ericsson.com>
---
 clang/docs/analyzer/checkers.rst              | 83 ++++++++++++++++++-
 .../clang/StaticAnalyzer/Checkers/Checkers.td |  2 +-
 2 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 66da1c7b35f28b..8af99a021ebdfd 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -849,10 +849,89 @@ Check for performance anti-patterns when using Grand Central Dispatch.
 
 .. _optin-performance-Padding:
 
-optin.performance.Padding
-"""""""""""""""""""""""""
+optin.performance.Padding (C, C++, ObjC)
+""""""""""""""""""""""""""""""""""""""""
 Check for excessively padded structs.
 
+This checker detects structs with excessive padding, which can lead to wasted
+memory and thus decreased performance by reducing the effectiveness of the
+processor cache. Padding bytes are added by compilers to align data accesses,
+as some processors require data to be aligned to certain boundaries. On others,
+unaligned data accesses are possible, but impose significantly larger latencies.
+
+To avoid padding bytes, the fields of a struct should be ordered by decreasing
+alignment. Usually, it's easier to think of the ``sizeof`` of the fields, and
+ordering the fields by ``sizeof`` would usually also lead to the same optimal
+layout.
+
+In rare cases, one can use the ``#pragma pack(1)`` directive to enforce a packed
+layout too, but it can significantly increase the access times, so reordering the
+fields is usually a better solution.
+
+
+.. code-block:: cpp
+
+ // warn: Excessive padding in 'struct NonOptimal' (35 padding bytes, where 3 is optimal)
+ struct NonOptimal {
+   char c1;
+   // 7 bytes of padding
+   std::int64_t big1; // 8 bytes
+   char c2;
+   // 7 bytes of padding
+   std::int64_t big2; // 8 bytes
+   char c3;
+   // 7 bytes of padding
+   std::int64_t big3; // 8 bytes
+   char c4;
+   // 7 bytes of padding
+   std::int64_t big4; // 8 bytes
+   char c5;
+   // 7 bytes of padding
+ };
+ static_assert(sizeof(NonOptimal) == 4*8+5+5*7);
+
+ // no-warning: The fields are nicely aligned to have the minimal amount of padding bytes.
+ struct Optimal {
+   std::int64_t big1; // 8 bytes
+   std::int64_t big2; // 8 bytes
+   std::int64_t big3; // 8 bytes
+   std::int64_t big4; // 8 bytes
+   char c1;
+   char c2;
+   char c3;
+   char c4;
+   char c5;
+   // 3 bytes of padding
+ };
+ static_assert(sizeof(Optimal) == 4*8+5+3);
+
+ // no-warning: Bit packing representation is also accepted by this checker, but
+ // it can significantly increase access times, so prefer reordering the fields.
+ #pragma pack(1)
+ struct BitPacked {
+   char c1;
+   std::int64_t big1; // 8 bytes
+   char c2;
+   std::int64_t big2; // 8 bytes
+   char c3;
+   std::int64_t big3; // 8 bytes
+   char c4;
+   std::int64_t big4; // 8 bytes
+   char c5;
+ };
+ static_assert(sizeof(BitPacked) == 4*8+5);
+
+The ``AllowedPad`` option can be used to specify a threshold for the number of
+padding bytes raising the warning. If the number of padding bytes of the struct
+and the optimal number of padding bytes differ by more than the threshold value,
+a warning will be raised.
+
+By default, the ``AllowedPad`` threshold is 24 bytes.
+
+To override this threshold to e.g. 4 bytes, use the
+``-analyzer-config optin.performance.Padding:AllowedPad=4`` option.
+
+
 .. _optin-portability-UnixAPI:
 
 optin.portability.UnixAPI
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index bf46766d44b391..5fe5c9286dabb7 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -908,7 +908,7 @@ def PaddingChecker : Checker<"Padding">,
                   "24",
                   Released>
   ]>,
-  Documentation<NotDocumented>;
+  Documentation<HasDocumentation>;
 
 } // end: "padding"
 

>From 4f9aab2b500d3df0cc5d54f2d29c8199507af66c Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve <pierre.vanhoutryve at amd.com>
Date: Wed, 27 Mar 2024 13:53:36 +0100
Subject: [PATCH 19/54] [NFC][TableGen][GlobalISel] Move MIR pattern parsing
 out of combiner (#86789)

Reland of cfa0833ccc7450a322e709583e894e4c96ce682e
---
 llvm/utils/TableGen/Common/CMakeLists.txt     |   2 +
 .../Common/GlobalISel/CombinerUtils.cpp       |  23 +
 .../Common/GlobalISel/CombinerUtils.h         |   4 +
 .../Common/GlobalISel/PatternParser.cpp       | 462 +++++++++++++++++
 .../Common/GlobalISel/PatternParser.h         | 118 +++++
 .../TableGen/GlobalISelCombinerEmitter.cpp    | 479 +-----------------
 6 files changed, 623 insertions(+), 465 deletions(-)
 create mode 100644 llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.cpp
 create mode 100644 llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp
 create mode 100644 llvm/utils/TableGen/Common/GlobalISel/PatternParser.h

diff --git a/llvm/utils/TableGen/Common/CMakeLists.txt b/llvm/utils/TableGen/Common/CMakeLists.txt
index 0440f027f28659..c31ed5a1de690c 100644
--- a/llvm/utils/TableGen/Common/CMakeLists.txt
+++ b/llvm/utils/TableGen/Common/CMakeLists.txt
@@ -12,10 +12,12 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_library(LLVMTableGenCommon STATIC OBJECT EXCLUDE_FROM_ALL
   GlobalISel/CodeExpander.cpp
+  GlobalISel/CombinerUtils.cpp
   GlobalISel/CXXPredicates.cpp
   GlobalISel/GlobalISelMatchTable.cpp
   GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp
   GlobalISel/MatchDataInfo.cpp
+  GlobalISel/PatternParser.cpp
   GlobalISel/Patterns.cpp
 
   AsmWriterInst.cpp
diff --git a/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.cpp b/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.cpp
new file mode 100644
index 00000000000000..37e6306050951b
--- /dev/null
+++ b/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.cpp
@@ -0,0 +1,23 @@
+//===- CombinerUtils.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CombinerUtils.h"
+#include "llvm/ADT/StringSet.h"
+
+namespace llvm {
+
+StringRef insertStrRef(StringRef S) {
+  if (S.empty())
+    return {};
+
+  static StringSet<> Pool;
+  auto [It, Inserted] = Pool.insert(S);
+  return It->getKey();
+}
+
+} // namespace llvm
diff --git a/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.h b/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.h
index 8cb2514a10e876..82a64c63edbde3 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/CombinerUtils.h
@@ -65,6 +65,10 @@ inline const DagInit *getDagWithOperatorOfSubClass(const Init &N,
         return I;
   return nullptr;
 }
+
+/// Copies a StringRef into a static pool to preserve it.
+StringRef insertStrRef(StringRef S);
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp
new file mode 100644
index 00000000000000..1d6c4c73a26405
--- /dev/null
+++ b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.cpp
@@ -0,0 +1,462 @@
+//===- PatternParser.cpp ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Common/GlobalISel/PatternParser.h"
+#include "Basic/CodeGenIntrinsics.h"
+#include "Common/CodeGenTarget.h"
+#include "Common/GlobalISel/CombinerUtils.h"
+#include "Common/GlobalISel/Patterns.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/SaveAndRestore.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+
+namespace llvm {
+namespace gi {
+static constexpr StringLiteral MIFlagsEnumClassName = "MIFlagEnum";
+
+namespace {
+class PrettyStackTraceParse : public PrettyStackTraceEntry {
+  const Record &Def;
+
+public:
+  PrettyStackTraceParse(const Record &Def) : Def(Def) {}
+
+  void print(raw_ostream &OS) const override {
+    if (Def.isSubClassOf("GICombineRule"))
+      OS << "Parsing GICombineRule '" << Def.getName() << '\'';
+    else if (Def.isSubClassOf(PatFrag::ClassName))
+      OS << "Parsing " << PatFrag::ClassName << " '" << Def.getName() << '\'';
+    else
+      OS << "Parsing '" << Def.getName() << '\'';
+    OS << '\n';
+  }
+};
+} // namespace
+
+bool PatternParser::parsePatternList(
+    const DagInit &List,
+    function_ref<bool(std::unique_ptr<Pattern>)> ParseAction,
+    StringRef Operator, StringRef AnonPatNamePrefix) {
+  if (List.getOperatorAsDef(DiagLoc)->getName() != Operator) {
+    PrintError(DiagLoc, "Expected " + Operator + " operator");
+    return false;
+  }
+
+  if (List.getNumArgs() == 0) {
+    PrintError(DiagLoc, Operator + " pattern list is empty");
+    return false;
+  }
+
+  // The match section consists of a list of matchers and predicates. Parse each
+  // one and add the equivalent GIMatchDag nodes, predicates, and edges.
+  for (unsigned I = 0; I < List.getNumArgs(); ++I) {
+    Init *Arg = List.getArg(I);
+    std::string Name = List.getArgName(I)
+                           ? List.getArgName(I)->getValue().str()
+                           : ("__" + AnonPatNamePrefix + "_" + Twine(I)).str();
+
+    if (auto Pat = parseInstructionPattern(*Arg, Name)) {
+      if (!ParseAction(std::move(Pat)))
+        return false;
+      continue;
+    }
+
+    if (auto Pat = parseWipMatchOpcodeMatcher(*Arg, Name)) {
+      if (!ParseAction(std::move(Pat)))
+        return false;
+      continue;
+    }
+
+    // Parse arbitrary C++ code
+    if (const auto *StringI = dyn_cast<StringInit>(Arg)) {
+      auto CXXPat = std::make_unique<CXXPattern>(*StringI, insertStrRef(Name));
+      if (!ParseAction(std::move(CXXPat)))
+        return false;
+      continue;
+    }
+
+    PrintError(DiagLoc,
+               "Failed to parse pattern: '" + Arg->getAsString() + '\'');
+    return false;
+  }
+
+  return true;
+}
+
+static const CodeGenInstruction &
+getInstrForIntrinsic(const CodeGenTarget &CGT, const CodeGenIntrinsic *I) {
+  StringRef Opc;
+  if (I->isConvergent) {
+    Opc = I->hasSideEffects ? "G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS"
+                            : "G_INTRINSIC_CONVERGENT";
+  } else {
+    Opc = I->hasSideEffects ? "G_INTRINSIC_W_SIDE_EFFECTS" : "G_INTRINSIC";
+  }
+
+  RecordKeeper &RK = I->TheDef->getRecords();
+  return CGT.getInstruction(RK.getDef(Opc));
+}
+
+static const CodeGenIntrinsic *getCodeGenIntrinsic(Record *R) {
+  // Intrinsics need to have a static lifetime because the match table keeps
+  // references to CodeGenIntrinsic objects.
+  static DenseMap<const Record *, std::unique_ptr<CodeGenIntrinsic>>
+      AllIntrinsics;
+
+  auto &Ptr = AllIntrinsics[R];
+  if (!Ptr)
+    Ptr = std::make_unique<CodeGenIntrinsic>(R, std::vector<Record *>());
+  return Ptr.get();
+}
+
+std::unique_ptr<Pattern>
+PatternParser::parseInstructionPattern(const Init &Arg, StringRef Name) {
+  const DagInit *DagPat = dyn_cast<DagInit>(&Arg);
+  if (!DagPat)
+    return nullptr;
+
+  std::unique_ptr<InstructionPattern> Pat;
+  if (const DagInit *IP = getDagWithOperatorOfSubClass(Arg, "Instruction")) {
+    auto &Instr = CGT.getInstruction(IP->getOperatorAsDef(DiagLoc));
+    Pat =
+        std::make_unique<CodeGenInstructionPattern>(Instr, insertStrRef(Name));
+  } else if (const DagInit *IP =
+                 getDagWithOperatorOfSubClass(Arg, "Intrinsic")) {
+    Record *TheDef = IP->getOperatorAsDef(DiagLoc);
+    const CodeGenIntrinsic *Intrin = getCodeGenIntrinsic(TheDef);
+    const CodeGenInstruction &Instr = getInstrForIntrinsic(CGT, Intrin);
+    Pat =
+        std::make_unique<CodeGenInstructionPattern>(Instr, insertStrRef(Name));
+    cast<CodeGenInstructionPattern>(*Pat).setIntrinsic(Intrin);
+  } else if (const DagInit *PFP =
+                 getDagWithOperatorOfSubClass(Arg, PatFrag::ClassName)) {
+    const Record *Def = PFP->getOperatorAsDef(DiagLoc);
+    const PatFrag *PF = parsePatFrag(Def);
+    if (!PF)
+      return nullptr; // Already diagnosed by parsePatFrag
+    Pat = std::make_unique<PatFragPattern>(*PF, insertStrRef(Name));
+  } else if (const DagInit *BP =
+                 getDagWithOperatorOfSubClass(Arg, BuiltinPattern::ClassName)) {
+    Pat = std::make_unique<BuiltinPattern>(*BP->getOperatorAsDef(DiagLoc),
+                                           insertStrRef(Name));
+  } else
+    return nullptr;
+
+  for (unsigned K = 0; K < DagPat->getNumArgs(); ++K) {
+    Init *Arg = DagPat->getArg(K);
+    if (auto *DagArg = getDagWithSpecificOperator(*Arg, "MIFlags")) {
+      if (!parseInstructionPatternMIFlags(*Pat, DagArg))
+        return nullptr;
+      continue;
+    }
+
+    if (!parseInstructionPatternOperand(*Pat, Arg, DagPat->getArgName(K)))
+      return nullptr;
+  }
+
+  if (!Pat->checkSemantics(DiagLoc))
+    return nullptr;
+
+  return std::move(Pat);
+}
+
+std::unique_ptr<Pattern>
+PatternParser::parseWipMatchOpcodeMatcher(const Init &Arg, StringRef Name) {
+  const DagInit *Matcher = getDagWithSpecificOperator(Arg, "wip_match_opcode");
+  if (!Matcher)
+    return nullptr;
+
+  if (Matcher->getNumArgs() == 0) {
+    PrintError(DiagLoc, "Empty wip_match_opcode");
+    return nullptr;
+  }
+
+  // Each argument is an opcode that can match.
+  auto Result = std::make_unique<AnyOpcodePattern>(insertStrRef(Name));
+  for (const auto &Arg : Matcher->getArgs()) {
+    Record *OpcodeDef = getDefOfSubClass(*Arg, "Instruction");
+    if (OpcodeDef) {
+      Result->addOpcode(&CGT.getInstruction(OpcodeDef));
+      continue;
+    }
+
+    PrintError(DiagLoc, "Arguments to wip_match_opcode must be instructions");
+    return nullptr;
+  }
+
+  return std::move(Result);
+}
+
+bool PatternParser::parseInstructionPatternOperand(InstructionPattern &IP,
+                                                   const Init *OpInit,
+                                                   const StringInit *OpName) {
+  const auto ParseErr = [&]() {
+    PrintError(DiagLoc,
+               "cannot parse operand '" + OpInit->getAsUnquotedString() + "' ");
+    if (OpName)
+      PrintNote(DiagLoc,
+                "operand name is '" + OpName->getAsUnquotedString() + '\'');
+    return false;
+  };
+
+  // untyped immediate, e.g. 0
+  if (const auto *IntImm = dyn_cast<IntInit>(OpInit)) {
+    std::string Name = OpName ? OpName->getAsUnquotedString() : "";
+    IP.addOperand(IntImm->getValue(), insertStrRef(Name), PatternType());
+    return true;
+  }
+
+  // typed immediate, e.g. (i32 0)
+  if (const auto *DagOp = dyn_cast<DagInit>(OpInit)) {
+    if (DagOp->getNumArgs() != 1)
+      return ParseErr();
+
+    const Record *TyDef = DagOp->getOperatorAsDef(DiagLoc);
+    auto ImmTy = PatternType::get(DiagLoc, TyDef,
+                                  "cannot parse immediate '" +
+                                      DagOp->getAsUnquotedString() + '\'');
+    if (!ImmTy)
+      return false;
+
+    if (!IP.hasAllDefs()) {
+      PrintError(DiagLoc, "out operand of '" + IP.getInstName() +
+                              "' cannot be an immediate");
+      return false;
+    }
+
+    const auto *Val = dyn_cast<IntInit>(DagOp->getArg(0));
+    if (!Val)
+      return ParseErr();
+
+    std::string Name = OpName ? OpName->getAsUnquotedString() : "";
+    IP.addOperand(Val->getValue(), insertStrRef(Name), *ImmTy);
+    return true;
+  }
+
+  // Typed operand e.g. $x/$z in (G_FNEG $x, $z)
+  if (auto *DefI = dyn_cast<DefInit>(OpInit)) {
+    if (!OpName) {
+      PrintError(DiagLoc, "expected an operand name after '" +
+                              OpInit->getAsString() + '\'');
+      return false;
+    }
+    const Record *Def = DefI->getDef();
+    auto Ty = PatternType::get(DiagLoc, Def, "cannot parse operand type");
+    if (!Ty)
+      return false;
+    IP.addOperand(insertStrRef(OpName->getAsUnquotedString()), *Ty);
+    return true;
+  }
+
+  // Untyped operand e.g. $x/$z in (G_FNEG $x, $z)
+  if (isa<UnsetInit>(OpInit)) {
+    assert(OpName && "Unset w/ no OpName?");
+    IP.addOperand(insertStrRef(OpName->getAsUnquotedString()), PatternType());
+    return true;
+  }
+
+  return ParseErr();
+}
+
+bool PatternParser::parseInstructionPatternMIFlags(InstructionPattern &IP,
+                                                   const DagInit *Op) {
+  auto *CGIP = dyn_cast<CodeGenInstructionPattern>(&IP);
+  if (!CGIP) {
+    PrintError(DiagLoc,
+               "matching/writing MIFlags is only allowed on CodeGenInstruction "
+               "patterns");
+    return false;
+  }
+
+  const auto CheckFlagEnum = [&](const Record *R) {
+    if (!R->isSubClassOf(MIFlagsEnumClassName)) {
+      PrintError(DiagLoc, "'" + R->getName() + "' is not a subclass of '" +
+                              MIFlagsEnumClassName + "'");
+      return false;
+    }
+
+    return true;
+  };
+
+  if (CGIP->getMIFlagsInfo()) {
+    PrintError(DiagLoc, "MIFlags can only be present once on an instruction");
+    return false;
+  }
+
+  auto &FI = CGIP->getOrCreateMIFlagsInfo();
+  for (unsigned K = 0; K < Op->getNumArgs(); ++K) {
+    const Init *Arg = Op->getArg(K);
+
+    // Match/set a flag: (MIFlags FmNoNans)
+    if (const auto *Def = dyn_cast<DefInit>(Arg)) {
+      const Record *R = Def->getDef();
+      if (!CheckFlagEnum(R))
+        return false;
+
+      FI.addSetFlag(R);
+      continue;
+    }
+
+    // Do not match a flag/unset a flag: (MIFlags (not FmNoNans))
+    if (const DagInit *NotDag = getDagWithSpecificOperator(*Arg, "not")) {
+      for (const Init *NotArg : NotDag->getArgs()) {
+        const DefInit *DefArg = dyn_cast<DefInit>(NotArg);
+        if (!DefArg) {
+          PrintError(DiagLoc, "cannot parse '" + NotArg->getAsUnquotedString() +
+                                  "': expected a '" + MIFlagsEnumClassName +
+                                  "'");
+          return false;
+        }
+
+        const Record *R = DefArg->getDef();
+        if (!CheckFlagEnum(R))
+          return false;
+
+        FI.addUnsetFlag(R);
+        continue;
+      }
+
+      continue;
+    }
+
+    // Copy flags from a matched instruction: (MIFlags $mi)
+    if (isa<UnsetInit>(Arg)) {
+      FI.addCopyFlag(insertStrRef(Op->getArgName(K)->getAsUnquotedString()));
+      continue;
+    }
+  }
+
+  return true;
+}
+
+std::unique_ptr<PatFrag> PatternParser::parsePatFragImpl(const Record *Def) {
+  auto StackTrace = PrettyStackTraceParse(*Def);
+  if (!Def->isSubClassOf(PatFrag::ClassName))
+    return nullptr;
+
+  const DagInit *Ins = Def->getValueAsDag("InOperands");
+  if (Ins->getOperatorAsDef(Def->getLoc())->getName() != "ins") {
+    PrintError(Def, "expected 'ins' operator for " + PatFrag::ClassName +
+                        " in operands list");
+    return nullptr;
+  }
+
+  const DagInit *Outs = Def->getValueAsDag("OutOperands");
+  if (Outs->getOperatorAsDef(Def->getLoc())->getName() != "outs") {
+    PrintError(Def, "expected 'outs' operator for " + PatFrag::ClassName +
+                        " out operands list");
+    return nullptr;
+  }
+
+  auto Result = std::make_unique<PatFrag>(*Def);
+  if (!parsePatFragParamList(*Outs, [&](StringRef Name, unsigned Kind) {
+        Result->addOutParam(insertStrRef(Name), (PatFrag::ParamKind)Kind);
+        return true;
+      }))
+    return nullptr;
+
+  if (!parsePatFragParamList(*Ins, [&](StringRef Name, unsigned Kind) {
+        Result->addInParam(insertStrRef(Name), (PatFrag::ParamKind)Kind);
+        return true;
+      }))
+    return nullptr;
+
+  const ListInit *Alts = Def->getValueAsListInit("Alternatives");
+  unsigned AltIdx = 0;
+  for (const Init *Alt : *Alts) {
+    const auto *PatDag = dyn_cast<DagInit>(Alt);
+    if (!PatDag) {
+      PrintError(Def, "expected dag init for PatFrag pattern alternative");
+      return nullptr;
+    }
+
+    PatFrag::Alternative &A = Result->addAlternative();
+    const auto AddPat = [&](std::unique_ptr<Pattern> Pat) {
+      A.Pats.push_back(std::move(Pat));
+      return true;
+    };
+
+    SaveAndRestore<ArrayRef<SMLoc>> DiagLocSAR(DiagLoc, Def->getLoc());
+    if (!parsePatternList(
+            *PatDag, AddPat, "pattern",
+            /*AnonPatPrefix*/
+            (Def->getName() + "_alt" + Twine(AltIdx++) + "_pattern").str()))
+      return nullptr;
+  }
+
+  if (!Result->buildOperandsTables() || !Result->checkSemantics())
+    return nullptr;
+
+  return Result;
+}
+
+bool PatternParser::parsePatFragParamList(
+    const DagInit &OpsList,
+    function_ref<bool(StringRef, unsigned)> ParseAction) {
+  for (unsigned K = 0; K < OpsList.getNumArgs(); ++K) {
+    const StringInit *Name = OpsList.getArgName(K);
+    const Init *Ty = OpsList.getArg(K);
+
+    if (!Name) {
+      PrintError(DiagLoc, "all operands must be named'");
+      return false;
+    }
+    const std::string NameStr = Name->getAsUnquotedString();
+
+    PatFrag::ParamKind OpKind;
+    if (isSpecificDef(*Ty, "gi_imm"))
+      OpKind = PatFrag::PK_Imm;
+    else if (isSpecificDef(*Ty, "root"))
+      OpKind = PatFrag::PK_Root;
+    else if (isa<UnsetInit>(Ty) ||
+             isSpecificDef(*Ty, "gi_mo")) // no type = gi_mo.
+      OpKind = PatFrag::PK_MachineOperand;
+    else {
+      PrintError(
+          DiagLoc,
+          '\'' + NameStr +
+              "' operand type was expected to be 'root', 'gi_imm' or 'gi_mo'");
+      return false;
+    }
+
+    if (!ParseAction(NameStr, (unsigned)OpKind))
+      return false;
+  }
+
+  return true;
+}
+
+const PatFrag *PatternParser::parsePatFrag(const Record *Def) {
+  // Cache already parsed PatFrags to avoid doing extra work.
+  static DenseMap<const Record *, std::unique_ptr<PatFrag>> ParsedPatFrags;
+
+  auto It = ParsedPatFrags.find(Def);
+  if (It != ParsedPatFrags.end()) {
+    SeenPatFrags.insert(It->second.get());
+    return It->second.get();
+  }
+
+  std::unique_ptr<PatFrag> NewPatFrag = parsePatFragImpl(Def);
+  if (!NewPatFrag) {
+    PrintError(Def, "Could not parse " + PatFrag::ClassName + " '" +
+                        Def->getName() + "'");
+    // Put a nullptr in the map so we don't attempt parsing this again.
+    ParsedPatFrags[Def] = nullptr;
+    return nullptr;
+  }
+
+  const auto *Res = NewPatFrag.get();
+  ParsedPatFrags[Def] = std::move(NewPatFrag);
+  SeenPatFrags.insert(Res);
+  return Res;
+}
+
+} // namespace gi
+} // namespace llvm
diff --git a/llvm/utils/TableGen/Common/GlobalISel/PatternParser.h b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.h
new file mode 100644
index 00000000000000..cd6f524075cdb9
--- /dev/null
+++ b/llvm/utils/TableGen/Common/GlobalISel/PatternParser.h
@@ -0,0 +1,118 @@
+//===- PatternParser.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Contains tools to parse MIR patterns from TableGen DAG elements.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_GLOBALISEL_PATTERNPARSER_H
+#define LLVM_UTILS_GLOBALISEL_PATTERNPARSER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/SMLoc.h"
+#include <memory>
+
+namespace llvm {
+class CodeGenTarget;
+class DagInit;
+class Init;
+class Record;
+class StringRef;
+class StringInit;
+
+namespace gi {
+class InstructionPattern;
+class Pattern;
+class PatFrag;
+
+/// Helper class to parse MIR Pattern lists.
+///
+/// e.g., `(match (G_FADD $x, $y, $z), (G_FNEG $y, $z))`
+class PatternParser {
+  const CodeGenTarget &CGT;
+  ArrayRef<SMLoc> DiagLoc;
+
+  mutable SmallPtrSet<const PatFrag *, 2> SeenPatFrags;
+
+public:
+  PatternParser(const CodeGenTarget &CGT, ArrayRef<SMLoc> DiagLoc)
+      : CGT(CGT), DiagLoc(DiagLoc) {}
+
+  /// Parses a list of patterns such as:
+  ///   (Operator (Pattern1 ...), (Pattern2 ...))
+  /// \param List         DagInit of the expected pattern list.
+  /// \param ParseAction  Callback to handle a successfully parsed pattern.
+  /// \param Operator     The name of the operator, e.g. "match"
+  /// \param AnonPatNamePrefix Prefix for anonymous pattern names.
+  /// \return true on success, false on failure.
+  bool
+  parsePatternList(const DagInit &List,
+                   function_ref<bool(std::unique_ptr<Pattern>)> ParseAction,
+                   StringRef Operator, StringRef AnonPatNamePrefix);
+
+  /// \returns all PatFrags encountered by this PatternParser.
+  const auto &getSeenPatFrags() const { return SeenPatFrags; }
+
+private:
+  /// Parse any InstructionPattern from a TableGen Init.
+  /// \param Arg Init to parse.
+  /// \param PatName Name of the pattern that will be parsed.
+  /// \return the parsed pattern on success, nullptr on failure.
+  std::unique_ptr<Pattern> parseInstructionPattern(const Init &Arg,
+                                                   StringRef PatName);
+
+  /// Parse a WipOpcodeMatcher from a TableGen Init.
+  /// \param Arg Init to parse.
+  /// \param PatName Name of the pattern that will be parsed.
+  /// \return the parsed pattern on success, nullptr on failure.
+  std::unique_ptr<Pattern> parseWipMatchOpcodeMatcher(const Init &Arg,
+                                                      StringRef PatName);
+
+  /// Parses an Operand of an InstructionPattern from a TableGen Init.
+  /// \param IP InstructionPattern for which we're parsing.
+  /// \param OpInit Init to parse.
+  /// \param OpName Name of the operand to parse.
+  /// \return true on success, false on failure.
+  bool parseInstructionPatternOperand(InstructionPattern &IP,
+                                      const Init *OpInit,
+                                      const StringInit *OpName);
+
+  /// Parses a MIFlag for an InstructionPattern from a TableGen Init.
+  /// \param IP InstructionPattern for which we're parsing.
+  /// \param Op Init to parse.
+  /// \return true on success, false on failure.
+  bool parseInstructionPatternMIFlags(InstructionPattern &IP,
+                                      const DagInit *Op);
+
+  /// (Uncached) PatFrag parsing implementation.
+  /// \param Def PatFrag def to parse.
+  /// \return the parsed PatFrag on success, nullptr on failure.
+  std::unique_ptr<PatFrag> parsePatFragImpl(const Record *Def);
+
+  /// Parses the in or out parameter list of a PatFrag.
+  /// \param OpsList Init to parse.
+  /// \param ParseAction Callback on successful parse, with the name of
+  ///                     the parameter and its \ref PatFrag::ParamKind
+  /// \return true on success, false on failure.
+  bool
+  parsePatFragParamList(const DagInit &OpsList,
+                        function_ref<bool(StringRef, unsigned)> ParseAction);
+
+  /// Cached PatFrag parser. This avoids duplicate work by keeping track of
+  /// already-parsed PatFrags.
+  /// \param Def PatFrag def to parse.
+  /// \return the parsed PatFrag on success, nullptr on failure.
+  const PatFrag *parsePatFrag(const Record *Def);
+};
+
+} // namespace gi
+} // namespace llvm
+
+#endif
diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
index 39b9f8a2ae1764..1ae6efd4a7d0a4 100644
--- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
@@ -36,6 +36,7 @@
 #include "Common/GlobalISel/GlobalISelMatchTable.h"
 #include "Common/GlobalISel/GlobalISelMatchTableExecutorEmitter.h"
 #include "Common/GlobalISel/MatchDataInfo.h"
+#include "Common/GlobalISel/PatternParser.h"
 #include "Common/GlobalISel/Patterns.h"
 #include "Common/SubtargetFeatureInfo.h"
 #include "llvm/ADT/APInt.h"
@@ -80,7 +81,6 @@ cl::opt<bool> DebugTypeInfer("gicombiner-debug-typeinfer",
 
 constexpr StringLiteral CXXApplyPrefix = "GICXXCustomAction_CombineApply";
 constexpr StringLiteral CXXPredPrefix = "GICXXPred_MI_Predicate_";
-constexpr StringLiteral MIFlagsEnumClassName = "MIFlagEnum";
 
 //===- CodeExpansions Helpers  --------------------------------------------===//
 
@@ -109,17 +109,6 @@ void declareTempRegExpansion(CodeExpansions &CE, unsigned TempRegID,
 
 //===- Misc. Helpers  -----------------------------------------------------===//
 
-/// Copies a StringRef into a static pool to preserve it.
-/// Most Pattern classes use StringRef so we need this.
-StringRef insertStrRef(StringRef S) {
-  if (S.empty())
-    return {};
-
-  static StringSet<> Pool;
-  auto [It, Inserted] = Pool.insert(S);
-  return It->getKey();
-}
-
 template <typename Container> auto keys(Container &&C) {
   return map_range(C, [](auto &Entry) -> auto & { return Entry.first; });
 }
@@ -639,8 +628,9 @@ class CombineRuleBuilder {
                      SubtargetFeatureInfoMap &SubtargetFeatures,
                      Record &RuleDef, unsigned ID,
                      std::vector<RuleMatcher> &OutRMs)
-      : CGT(CGT), SubtargetFeatures(SubtargetFeatures), RuleDef(RuleDef),
-        RuleID(ID), OutRMs(OutRMs) {}
+      : Parser(CGT, RuleDef.getLoc()), CGT(CGT),
+        SubtargetFeatures(SubtargetFeatures), RuleDef(RuleDef), RuleID(ID),
+        OutRMs(OutRMs) {}
 
   /// Parses all fields in the RuleDef record.
   bool parseAll();
@@ -718,26 +708,6 @@ class CombineRuleBuilder {
   bool buildRuleOperandsTable();
 
   bool parseDefs(const DagInit &Def);
-  bool
-  parsePatternList(const DagInit &List,
-                   function_ref<bool(std::unique_ptr<Pattern>)> ParseAction,
-                   StringRef Operator, ArrayRef<SMLoc> DiagLoc,
-                   StringRef AnonPatNamePrefix) const;
-
-  std::unique_ptr<Pattern> parseInstructionPattern(const Init &Arg,
-                                                   StringRef PatName) const;
-  std::unique_ptr<Pattern> parseWipMatchOpcodeMatcher(const Init &Arg,
-                                                      StringRef PatName) const;
-  bool parseInstructionPatternOperand(InstructionPattern &IP,
-                                      const Init *OpInit,
-                                      const StringInit *OpName) const;
-  bool parseInstructionPatternMIFlags(InstructionPattern &IP,
-                                      const DagInit *Op) const;
-  std::unique_ptr<PatFrag> parsePatFragImpl(const Record *Def) const;
-  bool parsePatFragParamList(
-      ArrayRef<SMLoc> DiagLoc, const DagInit &OpsList,
-      function_ref<bool(StringRef, PatFrag::ParamKind)> ParseAction) const;
-  const PatFrag *parsePatFrag(const Record *Def) const;
 
   bool emitMatchPattern(CodeExpansions &CE, const PatternAlternatives &Alts,
                         const InstructionPattern &IP);
@@ -781,6 +751,7 @@ class CombineRuleBuilder {
       DenseSet<const Pattern *> &SeenPats, OperandDefLookupFn LookupOperandDef,
       OperandMapperFnRef OperandMapper = [](const auto &O) { return O; });
 
+  PatternParser Parser;
   const CodeGenTarget &CGT;
   SubtargetFeatureInfoMap &SubtargetFeatures;
   Record &RuleDef;
@@ -808,9 +779,6 @@ class CombineRuleBuilder {
 
   SmallVector<MatchDataInfo, 2> MatchDatas;
   SmallVector<PatternAlternatives, 1> PermutationsToEmit;
-
-  // print()/debug-only members.
-  mutable SmallPtrSet<const PatFrag *, 2> SeenPatFrags;
 };
 
 bool CombineRuleBuilder::parseAll() {
@@ -819,16 +787,16 @@ bool CombineRuleBuilder::parseAll() {
   if (!parseDefs(*RuleDef.getValueAsDag("Defs")))
     return false;
 
-  if (!parsePatternList(
+  if (!Parser.parsePatternList(
           *RuleDef.getValueAsDag("Match"),
           [this](auto Pat) { return addMatchPattern(std::move(Pat)); }, "match",
-          RuleDef.getLoc(), (RuleDef.getName() + "_match").str()))
+          (RuleDef.getName() + "_match").str()))
     return false;
 
-  if (!parsePatternList(
+  if (!Parser.parsePatternList(
           *RuleDef.getValueAsDag("Apply"),
           [this](auto Pat) { return addApplyPattern(std::move(Pat)); }, "apply",
-          RuleDef.getLoc(), (RuleDef.getName() + "_apply").str()))
+          (RuleDef.getName() + "_apply").str()))
     return false;
 
   if (!buildRuleOperandsTable() || !typecheckPatterns() || !findRoots() ||
@@ -884,9 +852,10 @@ void CombineRuleBuilder::print(raw_ostream &OS) const {
     OS << "  )\n";
   }
 
-  if (!SeenPatFrags.empty()) {
+  const auto &SeenPFs = Parser.getSeenPatFrags();
+  if (!SeenPFs.empty()) {
     OS << "  (PatFrags\n";
-    for (const auto *PF : SeenPatFrags) {
+    for (const auto *PF : Parser.getSeenPatFrags()) {
       PF->print(OS, /*Indent=*/"    ");
       OS << '\n';
     }
@@ -1500,426 +1469,6 @@ bool CombineRuleBuilder::parseDefs(const DagInit &Def) {
   return true;
 }
 
-bool CombineRuleBuilder::parsePatternList(
-    const DagInit &List,
-    function_ref<bool(std::unique_ptr<Pattern>)> ParseAction,
-    StringRef Operator, ArrayRef<SMLoc> DiagLoc,
-    StringRef AnonPatNamePrefix) const {
-  if (List.getOperatorAsDef(RuleDef.getLoc())->getName() != Operator) {
-    ::PrintError(DiagLoc, "Expected " + Operator + " operator");
-    return false;
-  }
-
-  if (List.getNumArgs() == 0) {
-    ::PrintError(DiagLoc, Operator + " pattern list is empty");
-    return false;
-  }
-
-  // The match section consists of a list of matchers and predicates. Parse each
-  // one and add the equivalent GIMatchDag nodes, predicates, and edges.
-  for (unsigned I = 0; I < List.getNumArgs(); ++I) {
-    Init *Arg = List.getArg(I);
-    std::string Name = List.getArgName(I)
-                           ? List.getArgName(I)->getValue().str()
-                           : ("__" + AnonPatNamePrefix + "_" + Twine(I)).str();
-
-    if (auto Pat = parseInstructionPattern(*Arg, Name)) {
-      if (!ParseAction(std::move(Pat)))
-        return false;
-      continue;
-    }
-
-    if (auto Pat = parseWipMatchOpcodeMatcher(*Arg, Name)) {
-      if (!ParseAction(std::move(Pat)))
-        return false;
-      continue;
-    }
-
-    // Parse arbitrary C++ code
-    if (const auto *StringI = dyn_cast<StringInit>(Arg)) {
-      auto CXXPat = std::make_unique<CXXPattern>(*StringI, insertStrRef(Name));
-      if (!ParseAction(std::move(CXXPat)))
-        return false;
-      continue;
-    }
-
-    ::PrintError(DiagLoc,
-                 "Failed to parse pattern: '" + Arg->getAsString() + "'");
-    return false;
-  }
-
-  return true;
-}
-
-static const CodeGenInstruction &
-getInstrForIntrinsic(const CodeGenTarget &CGT, const CodeGenIntrinsic *I) {
-  StringRef Opc;
-  if (I->isConvergent) {
-    Opc = I->hasSideEffects ? "G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS"
-                            : "G_INTRINSIC_CONVERGENT";
-  } else {
-    Opc = I->hasSideEffects ? "G_INTRINSIC_W_SIDE_EFFECTS" : "G_INTRINSIC";
-  }
-
-  RecordKeeper &RK = I->TheDef->getRecords();
-  return CGT.getInstruction(RK.getDef(Opc));
-}
-
-static const CodeGenIntrinsic *getCodeGenIntrinsic(Record *R) {
-  // Intrinsics need to have a static lifetime because the match table keeps
-  // references to CodeGenIntrinsic objects.
-  static DenseMap<const Record *, std::unique_ptr<CodeGenIntrinsic>>
-      AllIntrinsics;
-
-  auto &Ptr = AllIntrinsics[R];
-  if (!Ptr)
-    Ptr = std::make_unique<CodeGenIntrinsic>(R, std::vector<Record *>());
-  return Ptr.get();
-}
-
-std::unique_ptr<Pattern>
-CombineRuleBuilder::parseInstructionPattern(const Init &Arg,
-                                            StringRef Name) const {
-  const DagInit *DagPat = dyn_cast<DagInit>(&Arg);
-  if (!DagPat)
-    return nullptr;
-
-  std::unique_ptr<InstructionPattern> Pat;
-  if (const DagInit *IP = getDagWithOperatorOfSubClass(Arg, "Instruction")) {
-    auto &Instr = CGT.getInstruction(IP->getOperatorAsDef(RuleDef.getLoc()));
-    Pat =
-        std::make_unique<CodeGenInstructionPattern>(Instr, insertStrRef(Name));
-  } else if (const DagInit *IP =
-                 getDagWithOperatorOfSubClass(Arg, "Intrinsic")) {
-    Record *TheDef = IP->getOperatorAsDef(RuleDef.getLoc());
-    const CodeGenIntrinsic *Intrin = getCodeGenIntrinsic(TheDef);
-    const CodeGenInstruction &Instr = getInstrForIntrinsic(CGT, Intrin);
-    Pat =
-        std::make_unique<CodeGenInstructionPattern>(Instr, insertStrRef(Name));
-    cast<CodeGenInstructionPattern>(*Pat).setIntrinsic(Intrin);
-  } else if (const DagInit *PFP =
-                 getDagWithOperatorOfSubClass(Arg, PatFrag::ClassName)) {
-    const Record *Def = PFP->getOperatorAsDef(RuleDef.getLoc());
-    const PatFrag *PF = parsePatFrag(Def);
-    if (!PF)
-      return nullptr; // Already diagnosed by parsePatFrag
-    Pat = std::make_unique<PatFragPattern>(*PF, insertStrRef(Name));
-  } else if (const DagInit *BP =
-                 getDagWithOperatorOfSubClass(Arg, BuiltinPattern::ClassName)) {
-    Pat = std::make_unique<BuiltinPattern>(
-        *BP->getOperatorAsDef(RuleDef.getLoc()), insertStrRef(Name));
-  } else
-    return nullptr;
-
-  for (unsigned K = 0; K < DagPat->getNumArgs(); ++K) {
-    Init *Arg = DagPat->getArg(K);
-    if (auto *DagArg = getDagWithSpecificOperator(*Arg, "MIFlags")) {
-      if (!parseInstructionPatternMIFlags(*Pat, DagArg))
-        return nullptr;
-      continue;
-    }
-
-    if (!parseInstructionPatternOperand(*Pat, Arg, DagPat->getArgName(K)))
-      return nullptr;
-  }
-
-  if (!Pat->checkSemantics(RuleDef.getLoc()))
-    return nullptr;
-
-  return std::move(Pat);
-}
-
-std::unique_ptr<Pattern>
-CombineRuleBuilder::parseWipMatchOpcodeMatcher(const Init &Arg,
-                                               StringRef Name) const {
-  const DagInit *Matcher = getDagWithSpecificOperator(Arg, "wip_match_opcode");
-  if (!Matcher)
-    return nullptr;
-
-  if (Matcher->getNumArgs() == 0) {
-    PrintError("Empty wip_match_opcode");
-    return nullptr;
-  }
-
-  // Each argument is an opcode that can match.
-  auto Result = std::make_unique<AnyOpcodePattern>(insertStrRef(Name));
-  for (const auto &Arg : Matcher->getArgs()) {
-    Record *OpcodeDef = getDefOfSubClass(*Arg, "Instruction");
-    if (OpcodeDef) {
-      Result->addOpcode(&CGT.getInstruction(OpcodeDef));
-      continue;
-    }
-
-    PrintError("Arguments to wip_match_opcode must be instructions");
-    return nullptr;
-  }
-
-  return std::move(Result);
-}
-
-bool CombineRuleBuilder::parseInstructionPatternOperand(
-    InstructionPattern &IP, const Init *OpInit,
-    const StringInit *OpName) const {
-  const auto ParseErr = [&]() {
-    PrintError("cannot parse operand '" + OpInit->getAsUnquotedString() + "' ");
-    if (OpName)
-      PrintNote("operand name is '" + OpName->getAsUnquotedString() + "'");
-    return false;
-  };
-
-  // untyped immediate, e.g. 0
-  if (const auto *IntImm = dyn_cast<IntInit>(OpInit)) {
-    std::string Name = OpName ? OpName->getAsUnquotedString() : "";
-    IP.addOperand(IntImm->getValue(), insertStrRef(Name), PatternType());
-    return true;
-  }
-
-  // typed immediate, e.g. (i32 0)
-  if (const auto *DagOp = dyn_cast<DagInit>(OpInit)) {
-    if (DagOp->getNumArgs() != 1)
-      return ParseErr();
-
-    const Record *TyDef = DagOp->getOperatorAsDef(RuleDef.getLoc());
-    auto ImmTy = PatternType::get(RuleDef.getLoc(), TyDef,
-                                  "cannot parse immediate '" +
-                                      DagOp->getAsUnquotedString() + "'");
-    if (!ImmTy)
-      return false;
-
-    if (!IP.hasAllDefs()) {
-      PrintError("out operand of '" + IP.getInstName() +
-                 "' cannot be an immediate");
-      return false;
-    }
-
-    const auto *Val = dyn_cast<IntInit>(DagOp->getArg(0));
-    if (!Val)
-      return ParseErr();
-
-    std::string Name = OpName ? OpName->getAsUnquotedString() : "";
-    IP.addOperand(Val->getValue(), insertStrRef(Name), *ImmTy);
-    return true;
-  }
-
-  // Typed operand e.g. $x/$z in (G_FNEG $x, $z)
-  if (auto *DefI = dyn_cast<DefInit>(OpInit)) {
-    if (!OpName) {
-      PrintError("expected an operand name after '" + OpInit->getAsString() +
-                 "'");
-      return false;
-    }
-    const Record *Def = DefI->getDef();
-    auto Ty =
-        PatternType::get(RuleDef.getLoc(), Def, "cannot parse operand type");
-    if (!Ty)
-      return false;
-    IP.addOperand(insertStrRef(OpName->getAsUnquotedString()), *Ty);
-    return true;
-  }
-
-  // Untyped operand e.g. $x/$z in (G_FNEG $x, $z)
-  if (isa<UnsetInit>(OpInit)) {
-    assert(OpName && "Unset w/ no OpName?");
-    IP.addOperand(insertStrRef(OpName->getAsUnquotedString()), PatternType());
-    return true;
-  }
-
-  return ParseErr();
-}
-
-bool CombineRuleBuilder::parseInstructionPatternMIFlags(
-    InstructionPattern &IP, const DagInit *Op) const {
-  auto *CGIP = dyn_cast<CodeGenInstructionPattern>(&IP);
-  if (!CGIP) {
-    PrintError("matching/writing MIFlags is only allowed on CodeGenInstruction "
-               "patterns");
-    return false;
-  }
-
-  const auto CheckFlagEnum = [&](const Record *R) {
-    if (!R->isSubClassOf(MIFlagsEnumClassName)) {
-      PrintError("'" + R->getName() + "' is not a subclass of '" +
-                 MIFlagsEnumClassName + "'");
-      return false;
-    }
-
-    return true;
-  };
-
-  if (CGIP->getMIFlagsInfo()) {
-    PrintError("MIFlags can only be present once on an instruction");
-    return false;
-  }
-
-  auto &FI = CGIP->getOrCreateMIFlagsInfo();
-  for (unsigned K = 0; K < Op->getNumArgs(); ++K) {
-    const Init *Arg = Op->getArg(K);
-
-    // Match/set a flag: (MIFlags FmNoNans)
-    if (const auto *Def = dyn_cast<DefInit>(Arg)) {
-      const Record *R = Def->getDef();
-      if (!CheckFlagEnum(R))
-        return false;
-
-      FI.addSetFlag(R);
-      continue;
-    }
-
-    // Do not match a flag/unset a flag: (MIFlags (not FmNoNans))
-    if (const DagInit *NotDag = getDagWithSpecificOperator(*Arg, "not")) {
-      for (const Init *NotArg : NotDag->getArgs()) {
-        const DefInit *DefArg = dyn_cast<DefInit>(NotArg);
-        if (!DefArg) {
-          PrintError("cannot parse '" + NotArg->getAsUnquotedString() +
-                     "': expected a '" + MIFlagsEnumClassName + "'");
-          return false;
-        }
-
-        const Record *R = DefArg->getDef();
-        if (!CheckFlagEnum(R))
-          return false;
-
-        FI.addUnsetFlag(R);
-        continue;
-      }
-
-      continue;
-    }
-
-    // Copy flags from a matched instruction: (MIFlags $mi)
-    if (isa<UnsetInit>(Arg)) {
-      FI.addCopyFlag(insertStrRef(Op->getArgName(K)->getAsUnquotedString()));
-      continue;
-    }
-  }
-
-  return true;
-}
-
-std::unique_ptr<PatFrag>
-CombineRuleBuilder::parsePatFragImpl(const Record *Def) const {
-  auto StackTrace = PrettyStackTraceParse(*Def);
-  if (!Def->isSubClassOf(PatFrag::ClassName))
-    return nullptr;
-
-  const DagInit *Ins = Def->getValueAsDag("InOperands");
-  if (Ins->getOperatorAsDef(Def->getLoc())->getName() != "ins") {
-    ::PrintError(Def, "expected 'ins' operator for " + PatFrag::ClassName +
-                          " in operands list");
-    return nullptr;
-  }
-
-  const DagInit *Outs = Def->getValueAsDag("OutOperands");
-  if (Outs->getOperatorAsDef(Def->getLoc())->getName() != "outs") {
-    ::PrintError(Def, "expected 'outs' operator for " + PatFrag::ClassName +
-                          " out operands list");
-    return nullptr;
-  }
-
-  auto Result = std::make_unique<PatFrag>(*Def);
-  if (!parsePatFragParamList(Def->getLoc(), *Outs,
-                             [&](StringRef Name, PatFrag::ParamKind Kind) {
-                               Result->addOutParam(insertStrRef(Name), Kind);
-                               return true;
-                             }))
-    return nullptr;
-
-  if (!parsePatFragParamList(Def->getLoc(), *Ins,
-                             [&](StringRef Name, PatFrag::ParamKind Kind) {
-                               Result->addInParam(insertStrRef(Name), Kind);
-                               return true;
-                             }))
-    return nullptr;
-
-  const ListInit *Alts = Def->getValueAsListInit("Alternatives");
-  unsigned AltIdx = 0;
-  for (const Init *Alt : *Alts) {
-    const auto *PatDag = dyn_cast<DagInit>(Alt);
-    if (!PatDag) {
-      ::PrintError(Def, "expected dag init for PatFrag pattern alternative");
-      return nullptr;
-    }
-
-    PatFrag::Alternative &A = Result->addAlternative();
-    const auto AddPat = [&](std::unique_ptr<Pattern> Pat) {
-      A.Pats.push_back(std::move(Pat));
-      return true;
-    };
-
-    if (!parsePatternList(
-            *PatDag, AddPat, "pattern", Def->getLoc(),
-            /*AnonPatPrefix*/
-            (Def->getName() + "_alt" + Twine(AltIdx++) + "_pattern").str()))
-      return nullptr;
-  }
-
-  if (!Result->buildOperandsTables() || !Result->checkSemantics())
-    return nullptr;
-
-  return Result;
-}
-
-bool CombineRuleBuilder::parsePatFragParamList(
-    ArrayRef<SMLoc> DiagLoc, const DagInit &OpsList,
-    function_ref<bool(StringRef, PatFrag::ParamKind)> ParseAction) const {
-  for (unsigned K = 0; K < OpsList.getNumArgs(); ++K) {
-    const StringInit *Name = OpsList.getArgName(K);
-    const Init *Ty = OpsList.getArg(K);
-
-    if (!Name) {
-      ::PrintError(DiagLoc, "all operands must be named'");
-      return false;
-    }
-    const std::string NameStr = Name->getAsUnquotedString();
-
-    PatFrag::ParamKind OpKind;
-    if (isSpecificDef(*Ty, "gi_imm"))
-      OpKind = PatFrag::PK_Imm;
-    else if (isSpecificDef(*Ty, "root"))
-      OpKind = PatFrag::PK_Root;
-    else if (isa<UnsetInit>(Ty) ||
-             isSpecificDef(*Ty, "gi_mo")) // no type = gi_mo.
-      OpKind = PatFrag::PK_MachineOperand;
-    else {
-      ::PrintError(
-          DiagLoc,
-          "'" + NameStr +
-              "' operand type was expected to be 'root', 'gi_imm' or 'gi_mo'");
-      return false;
-    }
-
-    if (!ParseAction(NameStr, OpKind))
-      return false;
-  }
-
-  return true;
-}
-
-const PatFrag *CombineRuleBuilder::parsePatFrag(const Record *Def) const {
-  // Cache already parsed PatFrags to avoid doing extra work.
-  static DenseMap<const Record *, std::unique_ptr<PatFrag>> ParsedPatFrags;
-
-  auto It = ParsedPatFrags.find(Def);
-  if (It != ParsedPatFrags.end()) {
-    SeenPatFrags.insert(It->second.get());
-    return It->second.get();
-  }
-
-  std::unique_ptr<PatFrag> NewPatFrag = parsePatFragImpl(Def);
-  if (!NewPatFrag) {
-    ::PrintError(Def, "Could not parse " + PatFrag::ClassName + " '" +
-                          Def->getName() + "'");
-    // Put a nullptr in the map so we don't attempt parsing this again.
-    ParsedPatFrags[Def] = nullptr;
-    return nullptr;
-  }
-
-  const auto *Res = NewPatFrag.get();
-  ParsedPatFrags[Def] = std::move(NewPatFrag);
-  SeenPatFrags.insert(Res);
-  return Res;
-}
-
 bool CombineRuleBuilder::emitMatchPattern(CodeExpansions &CE,
                                           const PatternAlternatives &Alts,
                                           const InstructionPattern &IP) {
@@ -2956,8 +2505,8 @@ GICombinerEmitter::buildMatchTable(MutableArrayRef<RuleMatcher> Rules) {
                                                const Matcher *B) {
     auto *L = static_cast<const RuleMatcher *>(A);
     auto *R = static_cast<const RuleMatcher *>(B);
-    return std::tuple(OpcodeOrder[L->getOpcode()], L->getNumOperands()) <
-           std::tuple(OpcodeOrder[R->getOpcode()], R->getNumOperands());
+    return std::make_tuple(OpcodeOrder[L->getOpcode()], L->getNumOperands()) <
+           std::make_tuple(OpcodeOrder[R->getOpcode()], R->getNumOperands());
   });
 
   for (Matcher *Rule : InputRules)

>From 26464f2662d13c7c6ef9f8180b1653c046cd60a7 Mon Sep 17 00:00:00 2001
From: Justin Cady <desk at justincady.com>
Date: Wed, 27 Mar 2024 09:03:46 -0400
Subject: [PATCH 20/54] [FreeBSD] Mark __stack_chk_guard dso_local except for
 PPC64 (#86665)

Adjust logic of 1cb9f37a17ab to match freebsd/freebsd-src@9a4d48a645a7a.

D113443 is the original attempt to bring this FreeBSD patch to llvm-project,
but it never landed. This change is required to build FreeBSD kernel modules
with -fstack-protector using a standard LLVM toolchain. The FreeBSD kernel
loader does not handle R_X86_64_REX_GOTPCRELX relocations.

Fixes #50932.
---
 llvm/lib/CodeGen/TargetLoweringBase.cpp  | 3 ++-
 llvm/test/CodeGen/X86/stack-protector.ll | 9 +++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 9990556f89ed8b..b16e78daf58614 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2073,7 +2073,8 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
     // FreeBSD has "__stack_chk_guard" defined externally on libc.so
     if (M.getDirectAccessExternalData() &&
         !TM.getTargetTriple().isWindowsGNUEnvironment() &&
-        !TM.getTargetTriple().isOSFreeBSD() &&
+        !(TM.getTargetTriple().isPPC64() &&
+          TM.getTargetTriple().isOSFreeBSD()) &&
         (!TM.getTargetTriple().isOSDarwin() ||
          TM.getRelocationModel() == Reloc::Static))
       GV->setDSOLocal(true);
diff --git a/llvm/test/CodeGen/X86/stack-protector.ll b/llvm/test/CodeGen/X86/stack-protector.ll
index a277f9f862ab26..f4f3ae4f55f2ee 100644
--- a/llvm/test/CodeGen/X86/stack-protector.ll
+++ b/llvm/test/CodeGen/X86/stack-protector.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
 ; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
 ; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-KERNEL-X64 %s
+; RUN: llc -code-model=kernel -mtriple=x86_64-unknown-freebsd < %s -o - | FileCheck --check-prefix=FREEBSD-KERNEL-X64 %s
 ; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | FileCheck --check-prefix=DARWIN-X64 %s
 ; RUN: llc -mtriple=amd64-pc-openbsd < %s -o - | FileCheck --check-prefix=OPENBSD-AMD64 %s
 ; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s
@@ -75,6 +76,10 @@ entry:
 ; LINUX-X64: mov{{l|q}} %fs:
 ; LINUX-X64: callq __stack_chk_fail
 
+; FREEBSD-KERNEL-X64-LABEL: test1b:
+; FREEBSD-KERNEL-X64-NOT: mov{{l|q}} __stack_chk_guard@GOTPCREL
+; FREEBSD-KERNEL-X64: callq __stack_chk_fail
+
 ; LINUX-KERNEL-X64-LABEL: test1b:
 ; LINUX-KERNEL-X64: mov{{l|q}} %gs:
 ; LINUX-KERNEL-X64: callq __stack_chk_fail
@@ -118,6 +123,10 @@ entry:
 ; LINUX-X64: mov{{l|q}} %fs:
 ; LINUX-X64: callq __stack_chk_fail
 
+; FREEBSD-KERNEL-X64-LABEL: test1c:
+; FREEBSD-KERNEL-X64: mov{{l|q}} __stack_chk_guard(%rip)
+; FREEBSD-KERNEL-X64: callq __stack_chk_fail
+
 ; LINUX-KERNEL-X64-LABEL: test1c:
 ; LINUX-KERNEL-X64: mov{{l|q}} %gs:
 ; LINUX-KERNEL-X64: callq __stack_chk_fail

>From 932949dbb517b089af28fdc480a16a738ee5db78 Mon Sep 17 00:00:00 2001
From: Egor Zhdan <e_zhdan at apple.com>
Date: Wed, 27 Mar 2024 13:13:06 +0000
Subject: [PATCH 21/54] [APINotes] Upstream the remaining API Notes fixes and
 tests

This upstreams the last bits of Clang API Notes functionality that is
currently implemented in the Apple fork:
https://github.com/apple/llvm-project/tree/next/clang/lib/APINotes
---
 clang/lib/Sema/SemaAPINotes.cpp               |  41 ++--
 clang/lib/Sema/SemaObjCProperty.cpp           |   4 +-
 .../Inputs/APINotes/SomeOtherKit.apinotes     |   8 +
 .../Inputs/BrokenHeaders/APINotes.apinotes    |   5 +
 .../Inputs/BrokenHeaders/SomeBrokenLib.h      |   6 +
 .../Inputs/BrokenHeaders2/APINotes.apinotes   |   7 +
 .../Inputs/BrokenHeaders2/SomeBrokenLib.h     |   6 +
 .../FrameworkWithActualPrivateModule.h        |   1 +
 .../Modules/module.modulemap                  |   5 +
 .../Modules/module.private.modulemap          |   5 +
 ...rkWithActualPrivateModule_Private.apinotes |   1 +
 ...FrameworkWithActualPrivateModule_Private.h |   2 +
 .../Headers/FrameworkWithWrongCase.h          |   1 +
 .../Modules/module.modulemap                  |   5 +
 .../FrameworkWithWrongCase_Private.apinotes   |   1 +
 .../Headers/FrameworkWithWrongCasePrivate.h   |   1 +
 .../Modules/module.modulemap                  |   5 +
 .../Modules/module.private.modulemap          |   1 +
 ...eworkWithWrongCasePrivate_Private.apinotes |   1 +
 .../LayeredKit.framework/Headers/LayeredKit.h |  11 ++
 .../Modules/module.modulemap                  |   5 +
 .../Headers/LayeredKitImpl.apinotes           |   9 +
 .../Headers/LayeredKitImpl.h                  |   7 +
 .../Modules/module.modulemap                  |   5 +
 .../Modules/module.modulemap                  |   5 +
 .../APINotes/SomeKit.apinotes                 |  74 +++++++
 .../APINotes/SomeKit_private.apinotes         |  15 ++
 .../Headers/SomeKitForNullAnnotation.h        |  55 ++++++
 .../Modules/module.modulemap                  |   5 +
 .../Modules/module.private.modulemap          |   8 +
 .../Modules/module_private.modulemap          |   8 +
 .../PrivateHeaders/SomeKit_Private.h          |  16 ++
 .../SomeKit_PrivateForNullAnnotation.h        |  17 ++
 .../PrivateHeaders/SomeKit_private.apinotes   |  15 ++
 .../APINotes/SomeOtherKit.apinotes            |   8 +
 .../Headers/SomeOtherKit.apinotes             |   8 +
 .../Headers/SomeOtherKit.h                    |   9 +
 .../Modules/module.modulemap                  |   5 +
 .../Headers/TopLevelPrivateKit.h              |   1 +
 .../TopLevelPrivateKit_Private.apinotes       |   1 +
 .../Modules/module.modulemap                  |   5 +
 .../Modules/module.private.modulemap          |   5 +
 .../TopLevelPrivateKit.apinotes               |   1 +
 .../TopLevelPrivateKit_Private.apinotes       |   4 +
 .../TopLevelPrivateKit_Private.h              |   1 +
 ...opLevelPrivateKit_Private_private.apinotes |   1 +
 .../Headers/VersionedKit.apinotes             | 156 +++++++++++++++
 .../Headers/VersionedKit.h                    | 137 +++++++++++++
 .../Modules/module.modulemap                  |   5 +
 .../APINotes/Inputs/Headers/APINotes.apinotes |  18 ++
 .../Inputs/Headers/BrokenTypes.apinotes       |  10 +
 .../APINotes/Inputs/Headers/BrokenTypes.h     |   8 +
 .../Inputs/Headers/ExternCtx.apinotes         |  15 ++
 .../test/APINotes/Inputs/Headers/ExternCtx.h  |  11 ++
 .../Inputs/Headers/HeaderLib.apinotes         |  37 ++++
 .../test/APINotes/Inputs/Headers/HeaderLib.h  |  19 ++
 .../Headers/InstancetypeModule.apinotes       |  10 +
 .../Inputs/Headers/InstancetypeModule.h       |  10 +
 .../Inputs/Headers/ModuleWithWrongCase.h      |   1 +
 .../Headers/ModuleWithWrongCasePrivate.h      |   1 +
 ...oduleWithWrongCasePrivate_Private.apinotes |   1 +
 .../ModuleWithWrongCase_Private.apinotes      |   1 +
 .../Inputs/Headers/Namespaces.apinotes        |  53 +++++
 .../test/APINotes/Inputs/Headers/Namespaces.h |  39 ++++
 .../Inputs/Headers/PrivateLib.apinotes        |   4 +
 .../test/APINotes/Inputs/Headers/PrivateLib.h |   1 +
 .../Headers/PrivateLib_private.apinotes       |   1 +
 .../Inputs/Headers/SwiftImportAs.apinotes     |   9 +
 .../APINotes/Inputs/Headers/SwiftImportAs.h   |   6 +
 .../APINotes/Inputs/Headers/module.modulemap  |  31 +++
 .../Inputs/Headers/module.private.modulemap   |   5 +
 .../Inputs/yaml-reader-errors/UIKit.apinotes  |  65 ++++++
 .../Inputs/yaml-reader-errors/UIKit.h         |   1 +
 .../yaml-reader-errors/module.modulemap       |   3 +
 clang/test/APINotes/availability.m            |  48 +++++
 clang/test/APINotes/broken_types.m            |  19 ++
 .../APINotes/case-for-private-apinotes-file.c |  22 +++
 clang/test/APINotes/extern-context.cpp        |  23 +++
 clang/test/APINotes/instancetype.m            |   9 +
 clang/test/APINotes/module-cache.m            |  65 ++++++
 clang/test/APINotes/namespaces.cpp            |  69 +++++++
 clang/test/APINotes/nullability.c             |  21 ++
 clang/test/APINotes/nullability.m             |  46 +++++
 .../test/APINotes/objc-forward-declarations.m |  12 ++
 clang/test/APINotes/objc_designated_inits.m   |  17 ++
 clang/test/APINotes/properties.m              |  42 ++++
 clang/test/APINotes/retain-count-convention.m |  38 ++++
 clang/test/APINotes/search-order.m            |  25 +++
 clang/test/APINotes/swift-import-as.cpp       |  16 ++
 .../test/APINotes/top-level-private-modules.c |   8 +
 clang/test/APINotes/types.m                   |  28 +++
 clang/test/APINotes/versioned-multi.c         |  69 +++++++
 clang/test/APINotes/versioned.m               | 187 ++++++++++++++++++
 clang/test/APINotes/yaml-convert-diags.c      |   6 +
 clang/test/APINotes/yaml-parse-diags.c        |   6 +
 clang/test/APINotes/yaml-reader-errors.m      |   5 +
 96 files changed, 1829 insertions(+), 20 deletions(-)
 create mode 100644 clang/test/APINotes/Inputs/APINotes/SomeOtherKit.apinotes
 create mode 100644 clang/test/APINotes/Inputs/BrokenHeaders/APINotes.apinotes
 create mode 100644 clang/test/APINotes/Inputs/BrokenHeaders/SomeBrokenLib.h
 create mode 100644 clang/test/APINotes/Inputs/BrokenHeaders2/APINotes.apinotes
 create mode 100644 clang/test/APINotes/Inputs/BrokenHeaders2/SomeBrokenLib.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Headers/FrameworkWithActualPrivateModule.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.private.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Headers/FrameworkWithWrongCase.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/PrivateHeaders/FrameworkWithWrongCase_Private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Headers/FrameworkWithWrongCasePrivate.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.private.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/PrivateHeaders/FrameworkWithWrongCasePrivate_Private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Headers/LayeredKit.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SimpleKit.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit_private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Headers/SomeKitForNullAnnotation.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.private.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module_private.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_Private.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_PrivateForNullAnnotation.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/APINotes/SomeOtherKit.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit_Private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.private.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private_private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.h
 create mode 100644 clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Modules/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Headers/APINotes.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/BrokenTypes.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/BrokenTypes.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/ExternCtx.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/ExternCtx.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/HeaderLib.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/HeaderLib.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/InstancetypeModule.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/InstancetypeModule.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate_Private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase_Private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/Namespaces.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/PrivateLib.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/PrivateLib.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/PrivateLib_private.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
 create mode 100644 clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
 create mode 100644 clang/test/APINotes/Inputs/Headers/module.modulemap
 create mode 100644 clang/test/APINotes/Inputs/Headers/module.private.modulemap
 create mode 100644 clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.apinotes
 create mode 100644 clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.h
 create mode 100644 clang/test/APINotes/Inputs/yaml-reader-errors/module.modulemap
 create mode 100644 clang/test/APINotes/availability.m
 create mode 100644 clang/test/APINotes/broken_types.m
 create mode 100644 clang/test/APINotes/case-for-private-apinotes-file.c
 create mode 100644 clang/test/APINotes/extern-context.cpp
 create mode 100644 clang/test/APINotes/instancetype.m
 create mode 100644 clang/test/APINotes/module-cache.m
 create mode 100644 clang/test/APINotes/namespaces.cpp
 create mode 100644 clang/test/APINotes/nullability.c
 create mode 100644 clang/test/APINotes/nullability.m
 create mode 100644 clang/test/APINotes/objc-forward-declarations.m
 create mode 100644 clang/test/APINotes/objc_designated_inits.m
 create mode 100644 clang/test/APINotes/properties.m
 create mode 100644 clang/test/APINotes/retain-count-convention.m
 create mode 100644 clang/test/APINotes/search-order.m
 create mode 100644 clang/test/APINotes/swift-import-as.cpp
 create mode 100644 clang/test/APINotes/top-level-private-modules.c
 create mode 100644 clang/test/APINotes/types.m
 create mode 100644 clang/test/APINotes/versioned-multi.c
 create mode 100644 clang/test/APINotes/versioned.m
 create mode 100644 clang/test/APINotes/yaml-convert-diags.c
 create mode 100644 clang/test/APINotes/yaml-parse-diags.c
 create mode 100644 clang/test/APINotes/yaml-reader-errors.m

diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 836c633e9d2042..a3128306c664fe 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -52,49 +52,54 @@ static void applyNullability(Sema &S, Decl *D, NullabilityKind Nullability,
   if (!Metadata.IsActive)
     return;
 
-  auto IsModified = [&](Decl *D, QualType QT,
-                        NullabilityKind Nullability) -> bool {
+  auto GetModified =
+      [&](Decl *D, QualType QT,
+          NullabilityKind Nullability) -> std::optional<QualType> {
     QualType Original = QT;
     S.CheckImplicitNullabilityTypeSpecifier(QT, Nullability, D->getLocation(),
                                             isa<ParmVarDecl>(D),
                                             /*OverrideExisting=*/true);
-    return QT.getTypePtr() != Original.getTypePtr();
+    return (QT.getTypePtr() != Original.getTypePtr()) ? std::optional(QT)
+                                                      : std::nullopt;
   };
 
   if (auto Function = dyn_cast<FunctionDecl>(D)) {
-    if (IsModified(D, Function->getReturnType(), Nullability)) {
-      QualType FnType = Function->getType();
-      Function->setType(FnType);
+    if (auto Modified =
+            GetModified(D, Function->getReturnType(), Nullability)) {
+      const FunctionType *FnType = Function->getType()->castAs<FunctionType>();
+      if (const FunctionProtoType *proto = dyn_cast<FunctionProtoType>(FnType))
+        Function->setType(S.Context.getFunctionType(
+            *Modified, proto->getParamTypes(), proto->getExtProtoInfo()));
+      else
+        Function->setType(
+            S.Context.getFunctionNoProtoType(*Modified, FnType->getExtInfo()));
     }
   } else if (auto Method = dyn_cast<ObjCMethodDecl>(D)) {
-    QualType Type = Method->getReturnType();
-    if (IsModified(D, Type, Nullability)) {
-      Method->setReturnType(Type);
+    if (auto Modified = GetModified(D, Method->getReturnType(), Nullability)) {
+      Method->setReturnType(*Modified);
 
       // Make it a context-sensitive keyword if we can.
-      if (!isIndirectPointerType(Type))
+      if (!isIndirectPointerType(*Modified))
         Method->setObjCDeclQualifier(Decl::ObjCDeclQualifier(
             Method->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability));
     }
   } else if (auto Value = dyn_cast<ValueDecl>(D)) {
-    QualType Type = Value->getType();
-    if (IsModified(D, Type, Nullability)) {
-      Value->setType(Type);
+    if (auto Modified = GetModified(D, Value->getType(), Nullability)) {
+      Value->setType(*Modified);
 
       // Make it a context-sensitive keyword if we can.
       if (auto Parm = dyn_cast<ParmVarDecl>(D)) {
-        if (Parm->isObjCMethodParameter() && !isIndirectPointerType(Type))
+        if (Parm->isObjCMethodParameter() && !isIndirectPointerType(*Modified))
           Parm->setObjCDeclQualifier(Decl::ObjCDeclQualifier(
               Parm->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability));
       }
     }
   } else if (auto Property = dyn_cast<ObjCPropertyDecl>(D)) {
-    QualType Type = Property->getType();
-    if (IsModified(D, Type, Nullability)) {
-      Property->setType(Type, Property->getTypeSourceInfo());
+    if (auto Modified = GetModified(D, Property->getType(), Nullability)) {
+      Property->setType(*Modified, Property->getTypeSourceInfo());
 
       // Make it a property attribute if we can.
-      if (!isIndirectPointerType(Type))
+      if (!isIndirectPointerType(*Modified))
         Property->setPropertyAttributes(
             ObjCPropertyAttribute::kind_null_resettable);
     }
diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp
index 4636d89ebf2b84..f9e1ad0121e2a2 100644
--- a/clang/lib/Sema/SemaObjCProperty.cpp
+++ b/clang/lib/Sema/SemaObjCProperty.cpp
@@ -638,8 +638,6 @@ ObjCPropertyDecl *Sema::CreatePropertyDecl(Scope *S,
     PDecl->setInvalidDecl();
   }
 
-  ProcessDeclAttributes(S, PDecl, FD.D);
-
   // Regardless of setter/getter attribute, we save the default getter/setter
   // selector names in anticipation of declaration of setter/getter methods.
   PDecl->setGetterName(GetterSel, GetterNameLoc);
@@ -647,6 +645,8 @@ ObjCPropertyDecl *Sema::CreatePropertyDecl(Scope *S,
   PDecl->setPropertyAttributesAsWritten(
                           makePropertyAttributesAsWritten(AttributesAsWritten));
 
+  ProcessDeclAttributes(S, PDecl, FD.D);
+
   if (Attributes & ObjCPropertyAttribute::kind_readonly)
     PDecl->setPropertyAttributes(ObjCPropertyAttribute::kind_readonly);
 
diff --git a/clang/test/APINotes/Inputs/APINotes/SomeOtherKit.apinotes b/clang/test/APINotes/Inputs/APINotes/SomeOtherKit.apinotes
new file mode 100644
index 00000000000000..ccdc4e15d34d1b
--- /dev/null
+++ b/clang/test/APINotes/Inputs/APINotes/SomeOtherKit.apinotes
@@ -0,0 +1,8 @@
+Name: SomeOtherKit
+Classes:
+  - Name: A
+    Methods:
+      - Selector:        "methodB"
+        MethodKind:      Instance
+        Availability:    none
+        AvailabilityMsg: "anything but this"
diff --git a/clang/test/APINotes/Inputs/BrokenHeaders/APINotes.apinotes b/clang/test/APINotes/Inputs/BrokenHeaders/APINotes.apinotes
new file mode 100644
index 00000000000000..cd5475b1342315
--- /dev/null
+++ b/clang/test/APINotes/Inputs/BrokenHeaders/APINotes.apinotes
@@ -0,0 +1,5 @@
+Name: SomeBrokenLib
+Functions:
+  - Name: do_something_with_pointers
+    Nu llabilityOfRet: O
+    # the space is intentional, to make sure we don't crash on malformed API Notes
diff --git a/clang/test/APINotes/Inputs/BrokenHeaders/SomeBrokenLib.h b/clang/test/APINotes/Inputs/BrokenHeaders/SomeBrokenLib.h
new file mode 100644
index 00000000000000..b09c6f63eae02e
--- /dev/null
+++ b/clang/test/APINotes/Inputs/BrokenHeaders/SomeBrokenLib.h
@@ -0,0 +1,6 @@
+#ifndef SOME_BROKEN_LIB_H
+#define SOME_BROKEN_LIB_H
+
+void do_something_with_pointers(int *ptr1, int *ptr2);
+
+#endif // SOME_BROKEN_LIB_H
diff --git a/clang/test/APINotes/Inputs/BrokenHeaders2/APINotes.apinotes b/clang/test/APINotes/Inputs/BrokenHeaders2/APINotes.apinotes
new file mode 100644
index 00000000000000..33eeaaada999d6
--- /dev/null
+++ b/clang/test/APINotes/Inputs/BrokenHeaders2/APINotes.apinotes
@@ -0,0 +1,7 @@
+Name: SomeBrokenLib
+Functions:
+  - Name: do_something_with_pointers
+    NullabilityOfRet: O
+  - Name: do_something_with_pointers
+    NullabilityOfRet: O
+    
diff --git a/clang/test/APINotes/Inputs/BrokenHeaders2/SomeBrokenLib.h b/clang/test/APINotes/Inputs/BrokenHeaders2/SomeBrokenLib.h
new file mode 100644
index 00000000000000..b09c6f63eae02e
--- /dev/null
+++ b/clang/test/APINotes/Inputs/BrokenHeaders2/SomeBrokenLib.h
@@ -0,0 +1,6 @@
+#ifndef SOME_BROKEN_LIB_H
+#define SOME_BROKEN_LIB_H
+
+void do_something_with_pointers(int *ptr1, int *ptr2);
+
+#endif // SOME_BROKEN_LIB_H
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Headers/FrameworkWithActualPrivateModule.h b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Headers/FrameworkWithActualPrivateModule.h
new file mode 100644
index 00000000000000..523de4f7ce0857
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Headers/FrameworkWithActualPrivateModule.h
@@ -0,0 +1 @@
+extern int FrameworkWithActualPrivateModule;
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..859d723716be21
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module FrameworkWithActualPrivateModule {
+  umbrella header "FrameworkWithActualPrivateModule.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.private.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.private.modulemap
new file mode 100644
index 00000000000000..e7fafe3bcbb17f
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/Modules/module.private.modulemap
@@ -0,0 +1,5 @@
+framework module FrameworkWithActualPrivateModule_Private {
+  umbrella header "FrameworkWithActualPrivateModule_Private.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.apinotes
new file mode 100644
index 00000000000000..831cf1e93d3519
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.apinotes
@@ -0,0 +1 @@
+Name: FrameworkWithActualPrivateModule_Private
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.h b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.h
new file mode 100644
index 00000000000000..c07a3e95d74049
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithActualPrivateModule.framework/PrivateHeaders/FrameworkWithActualPrivateModule_Private.h
@@ -0,0 +1,2 @@
+#include <FrameworkWithActualPrivateModule/FrameworkWithActualPrivateModule.h>
+extern int FrameworkWithActualPrivateModule_Private;
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Headers/FrameworkWithWrongCase.h b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Headers/FrameworkWithWrongCase.h
new file mode 100644
index 00000000000000..4f3b631c27e30d
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Headers/FrameworkWithWrongCase.h
@@ -0,0 +1 @@
+extern int FrameworkWithWrongCase;
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..e97d361039a150
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module FrameworkWithWrongCase {
+  umbrella header "FrameworkWithWrongCase.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/PrivateHeaders/FrameworkWithWrongCase_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/PrivateHeaders/FrameworkWithWrongCase_Private.apinotes
new file mode 100644
index 00000000000000..ae5447c61e33d0
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCase.framework/PrivateHeaders/FrameworkWithWrongCase_Private.apinotes
@@ -0,0 +1 @@
+Name: FrameworkWithWrongCase
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Headers/FrameworkWithWrongCasePrivate.h b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Headers/FrameworkWithWrongCasePrivate.h
new file mode 100644
index 00000000000000..d3d61483191c6e
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Headers/FrameworkWithWrongCasePrivate.h
@@ -0,0 +1 @@
+extern int FrameworkWithWrongCasePrivate;
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..04b96adbbfeb99
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module FrameworkWithWrongCasePrivate {
+  umbrella header "FrameworkWithWrongCasePrivate.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.private.modulemap b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.private.modulemap
new file mode 100644
index 00000000000000..d6ad53cdc71797
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/Modules/module.private.modulemap
@@ -0,0 +1 @@
+module FrameworkWithWrongCasePrivate.Inner {}
diff --git a/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/PrivateHeaders/FrameworkWithWrongCasePrivate_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/PrivateHeaders/FrameworkWithWrongCasePrivate_Private.apinotes
new file mode 100644
index 00000000000000..d7af293e8125f1
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/FrameworkWithWrongCasePrivate.framework/PrivateHeaders/FrameworkWithWrongCasePrivate_Private.apinotes
@@ -0,0 +1 @@
+Name: FrameworkWithWrongCasePrivate
diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Headers/LayeredKit.h b/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Headers/LayeredKit.h
new file mode 100644
index 00000000000000..a95d19ecbe9afc
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Headers/LayeredKit.h
@@ -0,0 +1,11 @@
+@import LayeredKitImpl;
+
+// @interface declarations already don't inherit attributes from forward 
+// declarations, so in order to test this properly we have to /not/ define
+// UpwardClass anywhere.
+
+// @interface UpwardClass
+// @end
+
+@protocol UpwardProto
+@end
diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..04bbe72a2b6e25
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKit.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module LayeredKit {
+  umbrella header "LayeredKit.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.apinotes b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.apinotes
new file mode 100644
index 00000000000000..bece28cfe60577
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.apinotes
@@ -0,0 +1,9 @@
+Name: LayeredKitImpl
+Classes:
+- Name: PerfectlyNormalClass
+  Availability: none
+- Name: UpwardClass
+  Availability: none
+Protocols:
+- Name: UpwardProto
+  Availability: none
diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.h b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.h
new file mode 100644
index 00000000000000..99591d35803aa1
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Headers/LayeredKitImpl.h
@@ -0,0 +1,7 @@
+@protocol UpwardProto;
+@class UpwardClass;
+
+@interface PerfectlyNormalClass
+@end
+
+void doImplementationThings(UpwardClass *first, id <UpwardProto> second) __attribute((unavailable));
diff --git a/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..58a6e55c1067f9
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/LayeredKitImpl.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module LayeredKitImpl {
+  umbrella header "LayeredKitImpl.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/SimpleKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/SimpleKit.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..2d07e76c0a142a
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SimpleKit.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module SimpleKit {
+  umbrella header "SimpleKit.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit.apinotes
new file mode 100644
index 00000000000000..817af123fc77b6
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit.apinotes
@@ -0,0 +1,74 @@
+Name: SomeKit
+Classes:
+  - Name: A
+    Methods:
+      - Selector:        "transform:"
+        MethodKind:      Instance
+        Availability:    none
+        AvailabilityMsg: "anything but this"
+      - Selector: "transform:integer:"
+        MethodKind:      Instance
+        NullabilityOfRet: N
+        Nullability:      [ N, S ]
+    Properties:
+      - Name: intValue
+        PropertyKind:    Instance
+        Availability: none
+        AvailabilityMsg: "wouldn't work anyway"
+      - Name: nonnullAInstance
+        PropertyKind:    Instance
+        Nullability:     N
+      - Name: nonnullAClass
+        PropertyKind:    Class
+        Nullability:     N
+      - Name: nonnullABoth
+        Nullability:     N
+  - Name: B
+    Availability: none
+    AvailabilityMsg: "just don't"
+  - Name: C
+    Methods:
+      - Selector: "initWithA:"
+        MethodKind: Instance
+        DesignatedInit: true
+  - Name: OverriddenTypes
+    Methods:
+      - Selector: "methodToMangle:second:"
+        MethodKind: Instance
+        ResultType: 'char *'
+        Parameters:
+          - Position: 0
+            Type: 'SOMEKIT_DOUBLE *'
+          - Position: 1
+            Type: 'float *'
+    Properties:
+      - Name: intPropertyToMangle
+        PropertyKind: Instance
+        Type: 'double *'
+Functions:
+  - Name: global_int_fun
+    ResultType: 'char *'
+    Parameters:
+      - Position: 0
+        Type: 'double *'
+      - Position: 1
+        Type: 'float *'
+Globals:
+  - Name: global_int_ptr
+    Type: 'double *'
+SwiftVersions:
+  - Version: 3.0
+    Classes:
+      - Name: A
+        Methods:
+          - Selector: "transform:integer:"
+            MethodKind:      Instance
+            NullabilityOfRet: O
+            Nullability:      [ O, S ]
+        Properties:
+          - Name: explicitNonnullInstance
+            PropertyKind:    Instance
+            Nullability:     O
+          - Name: explicitNullableInstance
+            PropertyKind:    Instance
+            Nullability:     N
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit_private.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit_private.apinotes
new file mode 100644
index 00000000000000..28ede9dfa25c08
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/APINotes/SomeKit_private.apinotes
@@ -0,0 +1,15 @@
+Name: SomeKit
+Classes:
+  - Name: A
+    Methods:         
+      - Selector: "privateTransform:input:"
+        MethodKind:      Instance
+        NullabilityOfRet: N
+        Nullability:      [ N, S ]
+    Properties:
+      - Name: internalProperty
+        Nullability: N
+Protocols:
+  - Name: InternalProtocol
+    Availability: none
+    AvailabilityMsg: "not for you"
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Headers/SomeKitForNullAnnotation.h b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Headers/SomeKitForNullAnnotation.h
new file mode 100644
index 00000000000000..bc0c5da8848e9a
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Headers/SomeKitForNullAnnotation.h
@@ -0,0 +1,55 @@
+#ifndef SOMEKIT_H
+#define SOMEKIT_H
+
+#define ROOT_CLASS __attribute__((objc_root_class))
+
+ROOT_CLASS
+@interface A
+-(A*)transform:(A*)input;
+-(A*)transform:(A*)input integer:(int)integer;
+
+@property (nonatomic, readonly, retain) A* someA;
+@property (nonatomic, retain) A* someOtherA;
+
+@property (nonatomic) int intValue;
+@end
+
+@interface B : A
+@end
+
+@interface C : A
+- (instancetype)init;
+- (instancetype)initWithA:(A*)a;
+@end
+
+
+@interface MyClass : A
+- Inst;
++ Clas;
+@end
+
+struct CGRect {
+  float origin;
+  float size;
+};
+typedef struct CGRect NSRect;
+
+@interface I
+- (void) Meth : (NSRect[4])exposedRects;
+- (void) Meth1 : (const  I*)exposedRects;
+- (void) Meth2 : (const I*)exposedRects;
+- (void) Meth3 : (I*)exposedRects;
+- (const I*) Meth4;
+- (const I*) Meth5 : (int) Arg1 : (const I*)Arg2 : (double)Arg3 :   (const I*) Arg4 :(const  volatile id) Arg5;
+- (volatile const I*) Meth6 : (const char *)Arg1 : (const char *)Arg2 : (double)Arg3 :   (const I*) Arg4 :(const  volatile id) Arg5;
+@end
+
+@class NSURL, NSArray, NSError;
+@interface INTF_BLOCKS
+  + (void)getNonLocalVersionsOfItemAtURL:(NSURL *)url completionHandler:(void (^)(NSArray *nonLocalFileVersions, NSError *error))completionHandler;
+  + (void *)getNonLocalVersionsOfItemAtURL2:(NSURL *)url completionHandler:(void (^)(NSArray *nonLocalFileVersions, NSError *error))completionHandler;
+  + (NSError **)getNonLocalVersionsOfItemAtURL3:(int)url completionHandler:(void (^)(NSArray *nonLocalFileVersions, NSError *error))completionHandler;
+  + (id)getNonLocalVersionsOfItemAtURL4:(NSURL *)url completionHandler:(void (^)(int nonLocalFileVersions, NSError *error, NSURL*))completionHandler;
+@end
+
+#endif
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..3abee2df0be1b7
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module SomeKit {
+  umbrella header "SomeKit.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.private.modulemap b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.private.modulemap
new file mode 100644
index 00000000000000..bbda9d08e3993e
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module.private.modulemap
@@ -0,0 +1,8 @@
+module SomeKit.Private {
+  header "SomeKit_Private.h"
+  export *
+
+  explicit module NullAnnotation {
+    header "SomeKit_PrivateForNullAnnotation.h"
+  }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module_private.modulemap b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module_private.modulemap
new file mode 100644
index 00000000000000..e31034317cb82a
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/Modules/module_private.modulemap
@@ -0,0 +1,8 @@
+explicit framework module SomeKit.Private {
+  header "SomeKit_Private.h"
+  explicit NullAnnotation { header "SomeKit_PrivateForNullAnnotation.h" }
+  export *
+  module * { export * }
+syntax error
+
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_Private.h b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_Private.h
new file mode 100644
index 00000000000000..c7611123e4ad2f
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_Private.h
@@ -0,0 +1,16 @@
+#ifndef SOMEKIT_PRIVATE_H
+#define SOMEKIT_PRIVATE_H
+
+#import <SomeKit/SomeKit.h>
+
+@interface A(Private)
+-(A*)privateTransform:(A*)input;
+
+@property (nonatomic) A* internalProperty;
+@end
+
+@protocol InternalProtocol
+@end
+
+#endif
+
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_PrivateForNullAnnotation.h b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_PrivateForNullAnnotation.h
new file mode 100644
index 00000000000000..bae4456b408093
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_PrivateForNullAnnotation.h
@@ -0,0 +1,17 @@
+#ifndef SOMEKIT_PRIVATE_H
+#define SOMEKIT_PRIVATE_H
+
+#import <SomeKit/SomeKitForNullAnnotation.h>
+
+@interface A(Private)
+-(A*)privateTransform:(A*)input;
+
+@property (nonatomic) A* internalProperty;
+@end
+
+@protocol InternalProtocol
+- (id) MomeMethod;
+@end
+
+#endif
+
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_private.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_private.apinotes
new file mode 100644
index 00000000000000..28ede9dfa25c08
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeKit.framework/PrivateHeaders/SomeKit_private.apinotes
@@ -0,0 +1,15 @@
+Name: SomeKit
+Classes:
+  - Name: A
+    Methods:         
+      - Selector: "privateTransform:input:"
+        MethodKind:      Instance
+        NullabilityOfRet: N
+        Nullability:      [ N, S ]
+    Properties:
+      - Name: internalProperty
+        Nullability: N
+Protocols:
+  - Name: InternalProtocol
+    Availability: none
+    AvailabilityMsg: "not for you"
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/APINotes/SomeOtherKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/APINotes/SomeOtherKit.apinotes
new file mode 100644
index 00000000000000..2ad546b8f8bcca
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/APINotes/SomeOtherKit.apinotes
@@ -0,0 +1,8 @@
+Name: SomeOtherKit
+Classes:
+  - Name: A
+    Methods:
+      - Selector:        "methodA"
+        MethodKind:      Instance
+        Availability:    none
+        AvailabilityMsg: "anything but this"
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.apinotes
new file mode 100644
index 00000000000000..2ad546b8f8bcca
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.apinotes
@@ -0,0 +1,8 @@
+Name: SomeOtherKit
+Classes:
+  - Name: A
+    Methods:
+      - Selector:        "methodA"
+        MethodKind:      Instance
+        Availability:    none
+        AvailabilityMsg: "anything but this"
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h
new file mode 100644
index 00000000000000..3911d765230c69
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h
@@ -0,0 +1,9 @@
+#ifndef SOME_OTHER_KIT_H
+
+__attribute__((objc_root_class))
+@interface A
+-(void)methodA;
+-(void)methodB;
+@end
+
+#endif
diff --git a/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..0aaad92e041ce3
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/SomeOtherKit.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module SomeOtherKit {
+  umbrella header "SomeOtherKit.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit.h b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit.h
new file mode 100644
index 00000000000000..d3376f1dac5d11
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit.h
@@ -0,0 +1 @@
+extern int TopLevelPrivateKit_Public;
diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit_Private.apinotes
new file mode 100644
index 00000000000000..ece1dd220adf52
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Headers/TopLevelPrivateKit_Private.apinotes
@@ -0,0 +1 @@
+garbage here because this file shouldn't get read
\ No newline at end of file
diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..70faa54e834778
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module TopLevelPrivateKit {
+  umbrella header "TopLevelPrivateKit.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.private.modulemap b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.private.modulemap
new file mode 100644
index 00000000000000..0958a14d671089
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/Modules/module.private.modulemap
@@ -0,0 +1,5 @@
+framework module TopLevelPrivateKit_Private {
+  umbrella header "TopLevelPrivateKit_Private.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit.apinotes
new file mode 100644
index 00000000000000..908dae0e3b0b24
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit.apinotes
@@ -0,0 +1 @@
+garbage here because this file shouldn't get read
diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.apinotes b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.apinotes
new file mode 100644
index 00000000000000..43323621588bb2
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.apinotes
@@ -0,0 +1,4 @@
+Name: TopLevelPrivateKit_Private
+Globals:
+- Name: TopLevelPrivateKit_Private
+  Type: float
diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.h b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.h
new file mode 100644
index 00000000000000..39cbfe6e9918ba
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private.h
@@ -0,0 +1 @@
+extern int TopLevelPrivateKit_Private;
diff --git a/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private_private.apinotes b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private_private.apinotes
new file mode 100644
index 00000000000000..ece1dd220adf52
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/TopLevelPrivateKit.framework/PrivateHeaders/TopLevelPrivateKit_Private_private.apinotes
@@ -0,0 +1 @@
+garbage here because this file shouldn't get read
\ No newline at end of file
diff --git a/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.apinotes b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.apinotes
new file mode 100644
index 00000000000000..572c714b3d61a7
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.apinotes
@@ -0,0 +1,156 @@
+Name: VersionedKit
+Classes:
+  - Name: TestProperties
+    SwiftObjCMembers: true
+    Properties:
+      - Name: accessorsOnly
+        PropertyKind:    Instance
+        SwiftImportAsAccessors: true
+      - Name: accessorsOnlyForClass
+        PropertyKind:    Class
+        SwiftImportAsAccessors: true
+      - Name: accessorsOnlyExceptInVersion3
+        PropertyKind:    Instance
+        SwiftImportAsAccessors: true
+      - Name: accessorsOnlyForClassExceptInVersion3
+        PropertyKind:    Class
+        SwiftImportAsAccessors: true
+Functions:
+  - Name: unversionedRenameDUMP
+    SwiftName: 'unversionedRename_NOTES()'
+Tags:
+  - Name: APINotedFlagEnum
+    FlagEnum: true
+  - Name: APINotedOpenEnum
+    EnumExtensibility: open
+  - Name: APINotedClosedEnum
+    EnumExtensibility: closed
+  - Name: SoonToBeCFEnum
+    EnumKind: CFEnum
+  - Name: SoonToBeNSEnum
+    EnumKind: NSEnum
+  - Name: SoonToBeCFOptions
+    EnumKind: CFOptions
+  - Name: SoonToBeNSOptions
+    EnumKind: NSOptions
+  - Name: SoonToBeCFClosedEnum
+    EnumKind: CFClosedEnum
+  - Name: SoonToBeNSClosedEnum
+    EnumKind: NSClosedEnum
+  - Name: UndoAllThatHasBeenDoneToMe
+    EnumKind: none
+Typedefs:
+  - Name: MultiVersionedTypedef34Notes
+    SwiftName: MultiVersionedTypedef34Notes_NEW
+  - Name: MultiVersionedTypedef345Notes
+    SwiftName: MultiVersionedTypedef345Notes_NEW
+  - Name: MultiVersionedTypedef4Notes
+    SwiftName: MultiVersionedTypedef4Notes_NEW
+  - Name: MultiVersionedTypedef45Notes
+    SwiftName: MultiVersionedTypedef45Notes_NEW
+SwiftVersions:
+  - Version: 3.0
+    Classes:
+      - Name: MyReferenceType
+        SwiftBridge: ''
+      - Name: TestGenericDUMP
+        SwiftImportAsNonGeneric: true
+      - Name: TestProperties
+        SwiftObjCMembers: false
+        Properties:
+          - Name: accessorsOnlyInVersion3
+            PropertyKind:    Instance
+            SwiftImportAsAccessors: true
+          - Name: accessorsOnlyForClassInVersion3
+            PropertyKind:    Class
+            SwiftImportAsAccessors: true
+          - Name: accessorsOnlyExceptInVersion3
+            PropertyKind:    Instance
+            SwiftImportAsAccessors: false
+          - Name: accessorsOnlyForClassExceptInVersion3
+            PropertyKind:    Class
+            SwiftImportAsAccessors: false
+      - Name: Swift3RenamedOnlyDUMP
+        SwiftName: SpecialSwift3Name
+      - Name: Swift3RenamedAlsoDUMP
+        SwiftName: SpecialSwift3Also
+    Functions:
+      - Name: moveToPointDUMP
+        SwiftName: 'moveTo(a:b:)'
+      - Name: acceptClosure
+        Parameters:      
+          - Position:        0
+            NoEscape:        false
+      - Name: privateFunc
+        SwiftPrivate: false
+    Tags:
+      - Name: MyErrorCode
+        NSErrorDomain: ''
+      - Name: NewlyFlagEnum
+        FlagEnum: false
+      - Name: OpenToClosedEnum
+        EnumExtensibility: open
+      - Name: ClosedToOpenEnum
+        EnumExtensibility: closed
+      - Name: NewlyClosedEnum
+        EnumExtensibility: none
+      - Name: NewlyOpenEnum
+        EnumExtensibility: none
+    Typedefs:
+      - Name: MyDoubleWrapper
+        SwiftWrapper: none
+      - Name: MultiVersionedTypedef34
+        SwiftName: MultiVersionedTypedef34_3
+      - Name: MultiVersionedTypedef34Header
+        SwiftName: MultiVersionedTypedef34Header_3
+      - Name: MultiVersionedTypedef34Notes
+        SwiftName: MultiVersionedTypedef34Notes_3
+      - Name: MultiVersionedTypedef345
+        SwiftName: MultiVersionedTypedef345_3
+      - Name: MultiVersionedTypedef345Header
+        SwiftName: MultiVersionedTypedef345Header_3
+      - Name: MultiVersionedTypedef345Notes
+        SwiftName: MultiVersionedTypedef345Notes_3
+  - Version: 5
+    Typedefs:
+      - Name: MultiVersionedTypedef345
+        SwiftName: MultiVersionedTypedef345_5
+      - Name: MultiVersionedTypedef345Header
+        SwiftName: MultiVersionedTypedef345Header_5
+      - Name: MultiVersionedTypedef345Notes
+        SwiftName: MultiVersionedTypedef345Notes_5
+      - Name: MultiVersionedTypedef45
+        SwiftName: MultiVersionedTypedef45_5
+      - Name: MultiVersionedTypedef45Header
+        SwiftName: MultiVersionedTypedef45Header_5
+      - Name: MultiVersionedTypedef45Notes
+        SwiftName: MultiVersionedTypedef45Notes_5
+  - Version: 4 # Versions are deliberately ordered as "3, 5, 4" to catch bugs.
+    Classes:
+      - Name: Swift4RenamedDUMP
+        SwiftName: SpecialSwift4Name
+    Typedefs:
+      - Name: MultiVersionedTypedef34
+        SwiftName: MultiVersionedTypedef34_4
+      - Name: MultiVersionedTypedef34Header
+        SwiftName: MultiVersionedTypedef34Header_4
+      - Name: MultiVersionedTypedef34Notes
+        SwiftName: MultiVersionedTypedef34Notes_4
+      - Name: MultiVersionedTypedef345
+        SwiftName: MultiVersionedTypedef345_4
+      - Name: MultiVersionedTypedef345Header
+        SwiftName: MultiVersionedTypedef345Header_4
+      - Name: MultiVersionedTypedef345Notes
+        SwiftName: MultiVersionedTypedef345Notes_4
+      - Name: MultiVersionedTypedef4
+        SwiftName: MultiVersionedTypedef4_4
+      - Name: MultiVersionedTypedef4Header
+        SwiftName: MultiVersionedTypedef4Header_4
+      - Name: MultiVersionedTypedef4Notes
+        SwiftName: MultiVersionedTypedef4Notes_4
+      - Name: MultiVersionedTypedef45
+        SwiftName: MultiVersionedTypedef45_4
+      - Name: MultiVersionedTypedef45Header
+        SwiftName: MultiVersionedTypedef45Header_4
+      - Name: MultiVersionedTypedef45Notes
+        SwiftName: MultiVersionedTypedef45Notes_4
diff --git a/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.h b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.h
new file mode 100644
index 00000000000000..9ce95633c523b7
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Headers/VersionedKit.h
@@ -0,0 +1,137 @@
+void moveToPointDUMP(double x, double y) __attribute__((swift_name("moveTo(x:y:)")));
+
+void unversionedRenameDUMP(void) __attribute__((swift_name("unversionedRename_HEADER()")));
+
+void acceptClosure(void (^ __attribute__((noescape)) block)(void));
+
+void privateFunc(void) __attribute__((swift_private));
+
+typedef double MyDoubleWrapper __attribute__((swift_wrapper(struct)));
+
+#if __OBJC__
+@class NSString;
+
+extern NSString *MyErrorDomain;
+
+enum __attribute__((ns_error_domain(MyErrorDomain))) MyErrorCode {
+  MyErrorCodeFailed = 1
+};
+
+__attribute__((swift_bridge("MyValueType")))
+@interface MyReferenceType
+@end
+
+@interface TestProperties
+@property (nonatomic, readwrite, retain) id accessorsOnly;
+@property (nonatomic, readwrite, retain, class) id accessorsOnlyForClass;
+
+@property (nonatomic, readwrite, retain) id accessorsOnlyInVersion3;
+@property (nonatomic, readwrite, retain, class) id accessorsOnlyForClassInVersion3;
+
+@property (nonatomic, readwrite, retain) id accessorsOnlyExceptInVersion3;
+@property (nonatomic, readwrite, retain, class) id accessorsOnlyForClassExceptInVersion3;
+@end
+
+@interface Base
+@end
+
+@interface TestGenericDUMP<Element> : Base
+- (Element)element;
+@end
+
+@interface Swift3RenamedOnlyDUMP
+@end
+
+__attribute__((swift_name("Swift4Name")))
+@interface Swift3RenamedAlsoDUMP
+@end
+
+@interface Swift4RenamedDUMP
+@end
+
+#endif
+
+
+enum __attribute__((flag_enum)) FlagEnum {
+  FlagEnumA = 1,
+  FlagEnumB = 2
+};
+
+enum __attribute__((flag_enum)) NewlyFlagEnum {
+  NewlyFlagEnumA = 1,
+  NewlyFlagEnumB = 2
+};
+
+enum APINotedFlagEnum {
+  APINotedFlagEnumA = 1,
+  APINotedFlagEnumB = 2
+};
+
+
+enum __attribute__((enum_extensibility(open))) OpenEnum {
+  OpenEnumA = 1,
+};
+
+enum __attribute__((enum_extensibility(open))) NewlyOpenEnum {
+  NewlyOpenEnumA = 1,
+};
+
+enum __attribute__((enum_extensibility(closed))) NewlyClosedEnum {
+  NewlyClosedEnumA = 1,
+};
+
+enum __attribute__((enum_extensibility(open))) ClosedToOpenEnum {
+  ClosedToOpenEnumA = 1,
+};
+
+enum __attribute__((enum_extensibility(closed))) OpenToClosedEnum {
+  OpenToClosedEnumA = 1,
+};
+
+enum APINotedOpenEnum {
+  APINotedOpenEnumA = 1,
+};
+
+enum APINotedClosedEnum {
+  APINotedClosedEnumA = 1,
+};
+
+
+enum SoonToBeCFEnum {
+  SoonToBeCFEnumA = 1
+};
+enum SoonToBeNSEnum {
+  SoonToBeNSEnumA = 1
+};
+enum SoonToBeCFOptions {
+  SoonToBeCFOptionsA = 1
+};
+enum SoonToBeNSOptions {
+  SoonToBeNSOptionsA = 1
+};
+enum SoonToBeCFClosedEnum {
+  SoonToBeCFClosedEnumA = 1
+};
+enum SoonToBeNSClosedEnum {
+  SoonToBeNSClosedEnumA = 1
+};
+enum UndoAllThatHasBeenDoneToMe {
+  UndoAllThatHasBeenDoneToMeA = 1
+} __attribute__((flag_enum)) __attribute__((enum_extensibility(closed)));
+
+
+typedef int MultiVersionedTypedef4;
+typedef int MultiVersionedTypedef4Notes;
+typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_NEW")));
+
+typedef int MultiVersionedTypedef34;
+typedef int MultiVersionedTypedef34Notes;
+typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_NEW")));
+
+typedef int MultiVersionedTypedef45;
+typedef int MultiVersionedTypedef45Notes;
+typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_NEW")));
+
+typedef int MultiVersionedTypedef345;
+typedef int MultiVersionedTypedef345Notes;
+typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_NEW")));
diff --git a/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Modules/module.modulemap b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Modules/module.modulemap
new file mode 100644
index 00000000000000..6d957fd68009f0
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Frameworks/VersionedKit.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module VersionedKit {
+  umbrella header "VersionedKit.h"
+  export *
+  module * { export * }
+}
diff --git a/clang/test/APINotes/Inputs/Headers/APINotes.apinotes b/clang/test/APINotes/Inputs/Headers/APINotes.apinotes
new file mode 100644
index 00000000000000..08210fc7056513
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/APINotes.apinotes
@@ -0,0 +1,18 @@
+Name: HeaderLib
+SwiftInferImportAsMember: true
+Functions:
+  - Name: custom_realloc
+    NullabilityOfRet: N
+    Nullability: [ N, S ]
+  - Name: unavailable_function
+    Availability: none
+    AvailabilityMsg: "I beg you not to use this"
+  - Name: do_something_with_pointers
+    NullabilityOfRet: O
+    Nullability: [ N, O ]
+    
+Globals:
+  - Name: global_int
+    Nullability: N
+  - Name: unavailable_global_int
+    Availability: none
diff --git a/clang/test/APINotes/Inputs/Headers/BrokenTypes.apinotes b/clang/test/APINotes/Inputs/Headers/BrokenTypes.apinotes
new file mode 100644
index 00000000000000..00f7b5074e9850
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/BrokenTypes.apinotes
@@ -0,0 +1,10 @@
+Name: BrokenTypes
+Functions:
+  - Name: break_me_function
+    ResultType: 'int * with extra junk'
+    Parameters:
+      - Position: 0
+        Type: 'not_a_type'
+Globals:
+  - Name: break_me_variable
+    Type: 'double'
diff --git a/clang/test/APINotes/Inputs/Headers/BrokenTypes.h b/clang/test/APINotes/Inputs/Headers/BrokenTypes.h
new file mode 100644
index 00000000000000..fee054b74cf701
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/BrokenTypes.h
@@ -0,0 +1,8 @@
+#ifndef BROKEN_TYPES_H
+#define BROKEN_TYPES_H
+
+char break_me_function(void *ptr);
+
+extern char break_me_variable;
+
+#endif // BROKEN_TYPES_H
diff --git a/clang/test/APINotes/Inputs/Headers/ExternCtx.apinotes b/clang/test/APINotes/Inputs/Headers/ExternCtx.apinotes
new file mode 100644
index 00000000000000..0f47ac6deea85d
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/ExternCtx.apinotes
@@ -0,0 +1,15 @@
+Name: ExternCtx
+Globals:
+  - Name: globalInExternC
+    Availability: none
+    AvailabilityMsg: "oh no"
+  - Name: globalInExternCXX
+    Availability: none
+    AvailabilityMsg: "oh no #2"
+Functions:
+  - Name: globalFuncInExternC
+    Availability: none
+    AvailabilityMsg: "oh no #3"
+  - Name: globalFuncInExternCXX
+    Availability: none
+    AvailabilityMsg: "oh no #4"
diff --git a/clang/test/APINotes/Inputs/Headers/ExternCtx.h b/clang/test/APINotes/Inputs/Headers/ExternCtx.h
new file mode 100644
index 00000000000000..669d443f60ecf1
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/ExternCtx.h
@@ -0,0 +1,11 @@
+extern "C" {
+  static int globalInExternC = 1;
+
+  static void globalFuncInExternC() {}
+}
+
+extern "C++" {
+  static int globalInExternCXX = 2;
+
+  static void globalFuncInExternCXX() {}
+}
diff --git a/clang/test/APINotes/Inputs/Headers/HeaderLib.apinotes b/clang/test/APINotes/Inputs/Headers/HeaderLib.apinotes
new file mode 100644
index 00000000000000..7dcb22476a1d26
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/HeaderLib.apinotes
@@ -0,0 +1,37 @@
+Name: HeaderLib
+SwiftInferImportAsMember: true
+Functions:
+  - Name: custom_realloc
+    NullabilityOfRet: N
+    Nullability: [ N, S ]
+  - Name: unavailable_function
+    Availability: none
+    AvailabilityMsg: "I beg you not to use this"
+  - Name: do_something_with_pointers
+    NullabilityOfRet: O
+    Nullability: [ N, O ]
+  - Name: do_something_with_arrays
+    Parameters:
+      - Position: 0
+        Nullability: N
+      - Position: 1
+        Nullability: N
+  - Name: take_pointer_and_int
+    Parameters:
+      - Position: 0
+        Nullability: N
+        NoEscape: true
+      - Position: 1
+        NoEscape: true
+Globals:
+  - Name: global_int
+    Nullability: N
+  - Name: unavailable_global_int
+    Availability: none
+Tags:
+  - Name: unavailable_struct
+    Availability: none
+
+Typedefs:
+  - Name: unavailable_typedef
+    Availability: none
diff --git a/clang/test/APINotes/Inputs/Headers/HeaderLib.h b/clang/test/APINotes/Inputs/Headers/HeaderLib.h
new file mode 100644
index 00000000000000..8065249607851b
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/HeaderLib.h
@@ -0,0 +1,19 @@
+#ifndef HEADER_LIB_H
+#define HEADER_LIB_H
+
+void *custom_realloc(void *member, unsigned size);
+
+int *global_int;
+
+int unavailable_function(void);
+int unavailable_global_int;
+
+void do_something_with_pointers(int *ptr1, int *ptr2);
+void do_something_with_arrays(int simple[], int nested[][2]);
+
+typedef int unavailable_typedef;
+struct unavailable_struct { int x, y, z; };
+
+void take_pointer_and_int(int *ptr1, int value);
+
+#endif
diff --git a/clang/test/APINotes/Inputs/Headers/InstancetypeModule.apinotes b/clang/test/APINotes/Inputs/Headers/InstancetypeModule.apinotes
new file mode 100644
index 00000000000000..813eb506f39a74
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/InstancetypeModule.apinotes
@@ -0,0 +1,10 @@
+Name: InstancetypeModule
+Classes:
+- Name: SomeBaseClass
+  Methods:
+  - Selector: instancetypeFactoryMethod
+    MethodKind: Class
+    ResultType: SomeBaseClass * _Nonnull
+  - Selector: staticFactoryMethod
+    MethodKind: Class
+    ResultType: SomeBaseClass * _Nonnull
diff --git a/clang/test/APINotes/Inputs/Headers/InstancetypeModule.h b/clang/test/APINotes/Inputs/Headers/InstancetypeModule.h
new file mode 100644
index 00000000000000..767f201d9faf63
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/InstancetypeModule.h
@@ -0,0 +1,10 @@
+@interface Object
+@end
+
+@interface SomeBaseClass : Object
++ (nullable instancetype)instancetypeFactoryMethod;
++ (nullable SomeBaseClass *)staticFactoryMethod;
+@end
+
+@interface SomeSubclass : SomeBaseClass
+@end
diff --git a/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase.h b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase.h
new file mode 100644
index 00000000000000..867a15cae9a664
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase.h
@@ -0,0 +1 @@
+extern int ModuleWithWrongCase;
diff --git a/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate.h b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate.h
new file mode 100644
index 00000000000000..aa014296ca7d23
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate.h
@@ -0,0 +1 @@
+extern int ModuleWithWrongCasePrivate;
diff --git a/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate_Private.apinotes b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate_Private.apinotes
new file mode 100644
index 00000000000000..dc6dc50bab6e69
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCasePrivate_Private.apinotes
@@ -0,0 +1 @@
+Name: ModuleWithWrongCasePrivate
diff --git a/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase_Private.apinotes b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase_Private.apinotes
new file mode 100644
index 00000000000000..dc6dc50bab6e69
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/ModuleWithWrongCase_Private.apinotes
@@ -0,0 +1 @@
+Name: ModuleWithWrongCasePrivate
diff --git a/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes b/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
new file mode 100644
index 00000000000000..e9da36787b638d
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/Namespaces.apinotes
@@ -0,0 +1,53 @@
+---
+Name: Namespaces
+Globals:
+  - Name: varInInlineNamespace
+    SwiftName: swiftVarInInlineNamespace
+Functions:
+  - Name: funcInNamespace
+    SwiftName: inWrongContext()
+  - Name: funcInInlineNamespace
+    SwiftName: swiftFuncInInlineNamespace()
+Tags:
+  - Name: char_box
+    SwiftName: InWrongContext
+Namespaces:
+  - Name: Namespace1
+    Typedefs:
+      - Name: my_typedef
+        SwiftName: SwiftTypedef
+      - Name: my_using_decl
+        SwiftName: SwiftUsingDecl
+    Globals:
+      - Name: varInNamespace
+        SwiftName: swiftVarInNamespace
+    Functions:
+      - Name: funcInNamespace
+        SwiftName: swiftFuncInNamespace()
+    Tags:
+      - Name: char_box
+        SwiftName: CharBox
+    Namespaces:
+      - Name: Nested1
+        Globals:
+          - Name: varInNestedNamespace
+            SwiftName: swiftVarInNestedNamespace
+        Functions:
+          - Name: funcInNestedNamespace
+            SwiftName: swiftFuncInNestedNamespace(_:)
+        Tags:
+          - Name: char_box
+            SwiftName: NestedCharBox
+        Namespaces:
+          - Name: Namespace1
+            Tags:
+              - Name: char_box
+                SwiftName: DeepNestedCharBox
+      - Name: Nested2
+        Globals:
+          - Name: varInNestedNamespace
+            SwiftName: swiftAnotherVarInNestedNamespace
+  - Name: InlineNamespace1
+    Functions:
+      - Name: funcInInlineNamespace
+        SwiftName: shouldNotSpellOutInlineNamespaces()
diff --git a/clang/test/APINotes/Inputs/Headers/Namespaces.h b/clang/test/APINotes/Inputs/Headers/Namespaces.h
new file mode 100644
index 00000000000000..6a79e996be86cd
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/Namespaces.h
@@ -0,0 +1,39 @@
+namespace Namespace1 { namespace Nested1 {} }
+
+namespace Namespace1 {
+static int varInNamespace = 1;
+struct char_box { char c; };
+void funcInNamespace();
+
+namespace Nested1 {
+void funcInNestedNamespace(int i);
+struct char_box {
+  char c;
+};
+}
+
+namespace Nested1 {
+static int varInNestedNamespace = 1;
+void funcInNestedNamespace(int i);
+
+namespace Namespace1 {
+struct char_box { char c; };
+} // namespace Namespace1
+} // namespace Nested1
+
+namespace Nested2 {
+static int varInNestedNamespace = 2;
+} // namespace Nested2
+
+namespace Nested1 { namespace Namespace1 {} }
+} // namespace Namespace1
+
+namespace Namespace1 {
+typedef int my_typedef;
+using my_using_decl = int;
+}
+
+inline namespace InlineNamespace1 {
+static int varInInlineNamespace = 3;
+void funcInInlineNamespace();
+}
diff --git a/clang/test/APINotes/Inputs/Headers/PrivateLib.apinotes b/clang/test/APINotes/Inputs/Headers/PrivateLib.apinotes
new file mode 100644
index 00000000000000..5f62284aadcaf7
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/PrivateLib.apinotes
@@ -0,0 +1,4 @@
+Name: HeaderLib
+Globals:
+- Name: PrivateLib
+  Type: float
diff --git a/clang/test/APINotes/Inputs/Headers/PrivateLib.h b/clang/test/APINotes/Inputs/Headers/PrivateLib.h
new file mode 100644
index 00000000000000..59aeef09bdd3b6
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/PrivateLib.h
@@ -0,0 +1 @@
+extern int PrivateLib;
diff --git a/clang/test/APINotes/Inputs/Headers/PrivateLib_private.apinotes b/clang/test/APINotes/Inputs/Headers/PrivateLib_private.apinotes
new file mode 100644
index 00000000000000..908dae0e3b0b24
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/PrivateLib_private.apinotes
@@ -0,0 +1 @@
+garbage here because this file shouldn't get read
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
new file mode 100644
index 00000000000000..5dbb83cab86bd7
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
@@ -0,0 +1,9 @@
+---
+Name: SwiftImportAs
+Tags:
+- Name: ImmortalRefType
+  SwiftImportAs: reference
+- Name: RefCountedType
+  SwiftImportAs: reference
+  SwiftReleaseOp: RCRelease
+  SwiftRetainOp: RCRetain
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
new file mode 100644
index 00000000000000..82b8a6749c4fe2
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
@@ -0,0 +1,6 @@
+struct ImmortalRefType {};
+
+struct RefCountedType { int value; };
+
+inline void RCRetain(RefCountedType *x) { x->value++; }
+inline void RCRelease(RefCountedType *x) { x->value--; }
diff --git a/clang/test/APINotes/Inputs/Headers/module.modulemap b/clang/test/APINotes/Inputs/Headers/module.modulemap
new file mode 100644
index 00000000000000..98b4ee3e96cfe7
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/module.modulemap
@@ -0,0 +1,31 @@
+module ExternCtx {
+  header "ExternCtx.h"
+}
+
+module HeaderLib {
+  header "HeaderLib.h"
+}
+
+module InstancetypeModule {
+  header "InstancetypeModule.h"
+}
+
+module BrokenTypes {
+  header "BrokenTypes.h"
+}
+
+module ModuleWithWrongCase {
+  header "ModuleWithWrongCase.h"
+}
+
+module ModuleWithWrongCasePrivate {
+  header "ModuleWithWrongCasePrivate.h"
+}
+
+module Namespaces {
+  header "Namespaces.h"
+}
+
+module SwiftImportAs {
+  header "SwiftImportAs.h"
+}
diff --git a/clang/test/APINotes/Inputs/Headers/module.private.modulemap b/clang/test/APINotes/Inputs/Headers/module.private.modulemap
new file mode 100644
index 00000000000000..2ecf322ed18d9c
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/module.private.modulemap
@@ -0,0 +1,5 @@
+module PrivateLib {
+  header "PrivateLib.h"
+}
+
+module ModuleWithWrongCasePrivate.Inner {}
diff --git a/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.apinotes b/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.apinotes
new file mode 100644
index 00000000000000..77db844008990d
--- /dev/null
+++ b/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.apinotes
@@ -0,0 +1,65 @@
+---
+Name:            UIKit
+Classes:
+  - Name:            UIFont
+    Methods:
+      - Selector:        'fontWithName:size:'
+        MethodKind:      Instance
+        Nullability:     [ N ]
+        NullabilityOfRet: O
+        DesignatedInit:  true
+# CHECK: duplicate definition of method '-[UIFont fontWithName:size:]'
+      - Selector:        'fontWithName:size:'
+        MethodKind:      Instance
+        Nullability:     [ N ]
+        NullabilityOfRet: O
+        DesignatedInit:  true
+    Properties:
+      - Name:            familyName
+        Nullability:     N
+      - Name:            fontName
+        Nullability:     N
+# CHECK: duplicate definition of instance property 'UIFont.familyName'
+      - Name:            familyName
+        Nullability:     N
+# CHECK: multiple definitions of class 'UIFont'
+  - Name:            UIFont
+Protocols:
+  - Name:            MyProto
+    AuditedForNullability: true
+# CHECK: multiple definitions of protocol 'MyProto'
+  - Name:            MyProto
+    AuditedForNullability: true
+Functions:
+  - Name:        'globalFoo'
+    Nullability:     [ N, N, O, S ]
+    NullabilityOfRet: O
+  - Name:        'globalFoo2'
+    Nullability:     [ N, N, O, S ]
+    NullabilityOfRet: O
+Globals:
+  - Name:            globalVar
+    Nullability:     O
+  - Name:            globalVar2
+    Nullability:     O
+Tags:
+# CHECK: cannot mix EnumKind and FlagEnum (for FlagAndEnumKind)
+  - Name: FlagAndEnumKind
+    FlagEnum: true
+    EnumKind: CFOptions
+# CHECK: cannot mix EnumKind and FlagEnum (for FlagAndEnumKind2)
+  - Name: FlagAndEnumKind2
+    EnumKind: CFOptions
+    FlagEnum: false
+# CHECK: cannot mix EnumKind and EnumExtensibility (for ExtensibilityAndEnumKind)
+  - Name: ExtensibilityAndEnumKind
+    EnumExtensibility: open
+    EnumKind: CFOptions
+# CHECK: cannot mix EnumKind and EnumExtensibility (for ExtensibilityAndEnumKind2)
+  - Name: ExtensibilityAndEnumKind2
+    EnumKind: CFOptions
+    EnumExtensibility: closed
+# CHECK: cannot mix EnumKind and EnumExtensibility (for ExtensibilityAndEnumKind3)
+  - Name: ExtensibilityAndEnumKind3
+    EnumKind: none
+    EnumExtensibility: none
diff --git a/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.h b/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.h
new file mode 100644
index 00000000000000..55313ae260ae11
--- /dev/null
+++ b/clang/test/APINotes/Inputs/yaml-reader-errors/UIKit.h
@@ -0,0 +1 @@
+extern int yesOfCourseThisIsWhatUIKitLooksLike;
diff --git a/clang/test/APINotes/Inputs/yaml-reader-errors/module.modulemap b/clang/test/APINotes/Inputs/yaml-reader-errors/module.modulemap
new file mode 100644
index 00000000000000..3d683d705cacf8
--- /dev/null
+++ b/clang/test/APINotes/Inputs/yaml-reader-errors/module.modulemap
@@ -0,0 +1,3 @@
+module UIKit {
+  header "UIKit.h"
+}
diff --git a/clang/test/APINotes/availability.m b/clang/test/APINotes/availability.m
new file mode 100644
index 00000000000000..2ddc2a73da8046
--- /dev/null
+++ b/clang/test/APINotes/availability.m
@@ -0,0 +1,48 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -Wno-private-module -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify
+
+#include "HeaderLib.h"
+#import <SomeKit/SomeKit.h>
+#import <SomeKit/SomeKit_Private.h>
+
+int main() {
+  int i;
+  i = unavailable_function(); // expected-error{{'unavailable_function' is unavailable: I beg you not to use this}}
+  // expected-note@HeaderLib.h:8{{'unavailable_function' has been explicitly marked unavailable here}}
+  i = unavailable_global_int; // expected-error{{'unavailable_global_int' is unavailable}}
+  // expected-note@HeaderLib.h:9{{'unavailable_global_int' has been explicitly marked unavailable here}}
+
+  unavailable_typedef t; // expected-error{{'unavailable_typedef' is unavailable}}
+  // expected-note@HeaderLib.h:14{{'unavailable_typedef' has been explicitly marked unavailable here}}
+
+  struct unavailable_struct s; // expected-error{{'unavailable_struct' is unavailable}}
+  // expected-note@HeaderLib.h:15{{'unavailable_struct' has been explicitly marked unavailable here}}
+
+  B *b = 0; // expected-error{{'B' is unavailable: just don't}}
+  // expected-note@SomeKit/SomeKit.h:15{{'B' has been explicitly marked unavailable here}}
+
+  id<InternalProtocol> proto = 0; // expected-error{{'InternalProtocol' is unavailable: not for you}}
+  // expected-note@SomeKit/SomeKit_Private.h:12{{'InternalProtocol' has been explicitly marked unavailable here}}
+
+  A *a = 0;
+  i = a.intValue; // expected-error{{intValue' is unavailable: wouldn't work anyway}}
+  // expected-note@SomeKit/SomeKit.h:12{{'intValue' has been explicitly marked unavailable here}}
+
+  [a transform:a]; // expected-error{{'transform:' is unavailable: anything but this}}
+  // expected-note@SomeKit/SomeKit.h:6{{'transform:' has been explicitly marked unavailable here}}
+
+  [a implicitGetOnlyInstance]; // expected-error{{'implicitGetOnlyInstance' is unavailable: getter gone}}
+  // expected-note@SomeKit/SomeKit.h:53{{'implicitGetOnlyInstance' has been explicitly marked unavailable here}}
+  [A implicitGetOnlyClass]; // expected-error{{'implicitGetOnlyClass' is unavailable: getter gone}}
+  // expected-note@SomeKit/SomeKit.h:54{{'implicitGetOnlyClass' has been explicitly marked unavailable here}}
+  [a implicitGetSetInstance]; // expected-error{{'implicitGetSetInstance' is unavailable: getter gone}}
+  // expected-note@SomeKit/SomeKit.h:56{{'implicitGetSetInstance' has been explicitly marked unavailable here}}
+  [a setImplicitGetSetInstance: a];  // expected-error{{'setImplicitGetSetInstance:' is unavailable: setter gone}}
+  // expected-note@SomeKit/SomeKit.h:56{{'setImplicitGetSetInstance:' has been explicitly marked unavailable here}}
+  [A implicitGetSetClass]; // expected-error{{'implicitGetSetClass' is unavailable: getter gone}}
+  // expected-note@SomeKit/SomeKit.h:57{{'implicitGetSetClass' has been explicitly marked unavailable here}}
+  [A setImplicitGetSetClass: a];  // expected-error{{'setImplicitGetSetClass:' is unavailable: setter gone}}
+  // expected-note@SomeKit/SomeKit.h:57{{'setImplicitGetSetClass:' has been explicitly marked unavailable here}}
+  return 0;
+}
+
diff --git a/clang/test/APINotes/broken_types.m b/clang/test/APINotes/broken_types.m
new file mode 100644
index 00000000000000..ee33ff7c4b4b9c
--- /dev/null
+++ b/clang/test/APINotes/broken_types.m
@@ -0,0 +1,19 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s 2> %t.err
+// RUN: FileCheck %s < %t.err
+
+#include "BrokenTypes.h"
+
+// CHECK: <API Notes>:1:1: error: unknown type name 'not_a_type'
+// CHECK-NEXT: not_a_type
+// CHECK-NEXT: ^
+
+// CHECK: <API Notes>:1:7: error: unparsed tokens following type
+// CHECK-NEXT: int * with extra junk
+// CHECK-NEXT:       ^
+
+// CHECK: BrokenTypes.h:4:6: error: API notes replacement type 'int *' has a different size from original type 'char'
+
+// CHECK: BrokenTypes.h:6:13: error: API notes replacement type 'double' has a different size from original type 'char'
+
+// CHECK: 5 errors generated.
diff --git a/clang/test/APINotes/case-for-private-apinotes-file.c b/clang/test/APINotes/case-for-private-apinotes-file.c
new file mode 100644
index 00000000000000..6aff3db54918e4
--- /dev/null
+++ b/clang/test/APINotes/case-for-private-apinotes-file.c
@@ -0,0 +1,22 @@
+// REQUIRES: case-insensitive-filesystem
+
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fapinotes-modules -fimplicit-module-maps -fmodules-cache-path=%t -F %S/Inputs/Frameworks -I %S/Inputs/Headers %s 2>&1 | FileCheck %s
+
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fapinotes-modules -fimplicit-module-maps -fmodules-cache-path=%t -iframework %S/Inputs/Frameworks -isystem %S/Inputs/Headers %s -Werror
+
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fsyntax-only -fmodules -fapinotes-modules -fimplicit-module-maps -fmodules-cache-path=%t -iframework %S/Inputs/Frameworks -isystem %S/Inputs/Headers %s -Wnonportable-private-system-apinotes-path 2>&1 | FileCheck %s
+
+#include <ModuleWithWrongCase.h>
+#include <ModuleWithWrongCasePrivate.h>
+#include <FrameworkWithWrongCase/FrameworkWithWrongCase.h>
+#include <FrameworkWithWrongCasePrivate/FrameworkWithWrongCasePrivate.h>
+#include <FrameworkWithActualPrivateModule/FrameworkWithActualPrivateModule_Private.h>
+
+// CHECK-NOT: warning:
+// CHECK: warning: private API notes file for module 'ModuleWithWrongCasePrivate' should be named 'ModuleWithWrongCasePrivate_private.apinotes', not 'ModuleWithWrongCasePrivate_Private.apinotes'
+// CHECK-NOT: warning:
+// CHECK: warning: private API notes file for module 'FrameworkWithWrongCasePrivate' should be named 'FrameworkWithWrongCasePrivate_private.apinotes', not 'FrameworkWithWrongCasePrivate_Private.apinotes'
+// CHECK-NOT: warning:
diff --git a/clang/test/APINotes/extern-context.cpp b/clang/test/APINotes/extern-context.cpp
new file mode 100644
index 00000000000000..331dee002361c0
--- /dev/null
+++ b/clang/test/APINotes/extern-context.cpp
@@ -0,0 +1,23 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInExternC -x c++ | FileCheck -check-prefix=CHECK-EXTERN-C %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalInExternCXX -x c++ | FileCheck -check-prefix=CHECK-EXTERN-CXX %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalFuncInExternC -x c++ | FileCheck -check-prefix=CHECK-FUNC-EXTERN-C %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -ast-dump -ast-dump-filter globalFuncInExternCXX -x c++ | FileCheck -check-prefix=CHECK-FUNC-EXTERN-CXX %s
+
+#include "ExternCtx.h"
+
+// CHECK-EXTERN-C: Dumping globalInExternC:
+// CHECK-EXTERN-C: VarDecl {{.+}} imported in ExternCtx globalInExternC 'int'
+// CHECK-EXTERN-C: UnavailableAttr {{.+}} <<invalid sloc>> "oh no"
+
+// CHECK-EXTERN-CXX: Dumping globalInExternCXX:
+// CHECK-EXTERN-CXX: VarDecl {{.+}} imported in ExternCtx globalInExternCXX 'int'
+// CHECK-EXTERN-CXX: UnavailableAttr {{.+}} <<invalid sloc>> "oh no #2"
+
+// CHECK-FUNC-EXTERN-C: Dumping globalFuncInExternC:
+// CHECK-FUNC-EXTERN-C: FunctionDecl {{.+}} imported in ExternCtx globalFuncInExternC 'void ()'
+// CHECK-FUNC-EXTERN-C: UnavailableAttr {{.+}} <<invalid sloc>> "oh no #3"
+
+// CHECK-FUNC-EXTERN-CXX: Dumping globalFuncInExternCXX:
+// CHECK-FUNC-EXTERN-CXX: FunctionDecl {{.+}} imported in ExternCtx globalFuncInExternCXX 'void ()'
+// CHECK-FUNC-EXTERN-CXX: UnavailableAttr {{.+}} <<invalid sloc>> "oh no #4"
diff --git a/clang/test/APINotes/instancetype.m b/clang/test/APINotes/instancetype.m
new file mode 100644
index 00000000000000..30339e5386f634
--- /dev/null
+++ b/clang/test/APINotes/instancetype.m
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -verify %s
+
+@import InstancetypeModule;
+
+void test() {
+  // The nullability is here to verify that the API notes were applied.
+  int good = [SomeSubclass instancetypeFactoryMethod]; // expected-error {{initializing 'int' with an expression of type 'SomeSubclass * _Nonnull'}}
+  int bad = [SomeSubclass staticFactoryMethod]; // expected-error {{initializing 'int' with an expression of type 'SomeBaseClass * _Nonnull'}}
+}
diff --git a/clang/test/APINotes/module-cache.m b/clang/test/APINotes/module-cache.m
new file mode 100644
index 00000000000000..5dcaf1181f9dcf
--- /dev/null
+++ b/clang/test/APINotes/module-cache.m
@@ -0,0 +1,65 @@
+// RUN: rm -rf %t
+
+// Set up directories
+// RUN: mkdir -p %t/APINotes
+// RUN: cp %S/Inputs/APINotes/SomeOtherKit.apinotes %t/APINotes/SomeOtherKit.apinotes
+// RUN: mkdir -p %t/Frameworks
+// RUN: cp -r %S/Inputs/Frameworks/SomeOtherKit.framework %t/Frameworks
+
+// First build: check that 'methodB' is unavailable but 'methodA' is available.
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes  -F %t/Frameworks %s > %t/before.log 2>&1
+// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/before.log
+// RUN: FileCheck -check-prefix=CHECK-REBUILD %s < %t/before.log
+// RUN: FileCheck -check-prefix=CHECK-ONE-ERROR %s < %t/before.log
+
+// Do it again; now we're using caches.
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes  -F %t/Frameworks %s > %t/before.log 2>&1
+// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/before.log
+// RUN: FileCheck -check-prefix=CHECK-WITHOUT-REBUILD %s < %t/before.log
+// RUN: FileCheck -check-prefix=CHECK-ONE-ERROR %s < %t/before.log
+
+// Add a blank line to the header to force the module to rebuild, without
+// (yet) changing API notes.
+// RUN: echo >> %t/Frameworks/SomeOtherKit.framework/Headers/SomeOtherKit.h
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes  -F %t/Frameworks %s > %t/before.log 2>&1
+// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/before.log
+// RUN: FileCheck -check-prefix=CHECK-REBUILD %s < %t/before.log
+// RUN: FileCheck -check-prefix=CHECK-ONE-ERROR %s < %t/before.log
+
+// Change the API notes file, after the module has rebuilt once.
+// RUN: echo '      - Selector: "methodA"' >> %t/APINotes/SomeOtherKit.apinotes
+// RUN: echo '        MethodKind: Instance' >> %t/APINotes/SomeOtherKit.apinotes
+// RUN: echo '        Availability: none' >> %t/APINotes/SomeOtherKit.apinotes
+// RUN: echo '        AvailabilityMsg: "not here either"' >> %t/APINotes/SomeOtherKit.apinotes
+
+// Build again: check that both methods are now unavailable and that the module rebuilt.
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes  -F %t/Frameworks %s > %t/after.log 2>&1
+// RUN: FileCheck -check-prefix=CHECK-METHODA %s < %t/after.log
+// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/after.log
+// RUN: FileCheck -check-prefix=CHECK-REBUILD %s < %t/after.log
+// RUN: FileCheck -check-prefix=CHECK-TWO-ERRORS %s < %t/after.log
+
+// Run the build again: check that both methods are now unavailable
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -Rmodule-build -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %t/APINotes  -F %t/Frameworks %s > %t/after.log 2>&1
+// RUN: FileCheck -check-prefix=CHECK-METHODA %s < %t/after.log
+// RUN: FileCheck -check-prefix=CHECK-METHODB %s < %t/after.log
+// RUN: FileCheck -check-prefix=CHECK-WITHOUT-REBUILD %s < %t/after.log
+// RUN: FileCheck -check-prefix=CHECK-TWO-ERRORS %s < %t/after.log
+
+@import SomeOtherKit;
+
+void test(A *a) {
+  // CHECK-METHODA: error: 'methodA' is unavailable: not here either
+  [a methodA];
+
+  // CHECK-METHODB: error: 'methodB' is unavailable: anything but this
+  [a methodB];
+}
+
+// CHECK-REBUILD: remark: building module{{.*}}SomeOtherKit
+
+// CHECK-WITHOUT-REBUILD-NOT: remark: building module{{.*}}SomeOtherKit
+
+// CHECK-ONE-ERROR: 1 error generated.
+// CHECK-TWO-ERRORS: 2 errors generated.
+
diff --git a/clang/test/APINotes/namespaces.cpp b/clang/test/APINotes/namespaces.cpp
new file mode 100644
index 00000000000000..2f9d93c2ea0e5a
--- /dev/null
+++ b/clang/test/APINotes/namespaces.cpp
@@ -0,0 +1,69 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -x objective-c++
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::my_typedef -x objective-c++ | FileCheck -check-prefix=CHECK-TYPEDEF-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::my_using_decl -x objective-c++ | FileCheck -check-prefix=CHECK-USING-DECL-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::varInNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::funcInNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested2::varInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::funcInNestedNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Namespace1::Nested1::Namespace1::char_box -x objective-c++ | FileCheck -check-prefix=CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter varInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-GLOBAL-IN-INLINE-NAMESPACE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/CxxInterop -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter funcInInlineNamespace -x objective-c++ | FileCheck -check-prefix=CHECK-FUNC-IN-INLINE-NAMESPACE %s
+
+#import <Namespaces.h>
+
+// CHECK-TYPEDEF-IN-NAMESPACE: Dumping Namespace1::my_typedef:
+// CHECK-TYPEDEF-IN-NAMESPACE-NEXT: TypedefDecl {{.+}} imported in Namespaces my_typedef 'int'
+// CHECK-TYPEDEF-IN-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "SwiftTypedef"
+
+// CHECK-USING-DECL-IN-NAMESPACE: Dumping Namespace1::my_using_decl:
+// CHECK-USING-DECL-IN-NAMESPACE-NEXT: TypeAliasDecl {{.+}} imported in Namespaces my_using_decl 'int'
+// CHECK-USING-DECL-IN-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "SwiftUsingDecl"
+
+// CHECK-GLOBAL-IN-NAMESPACE: Dumping Namespace1::varInNamespace:
+// CHECK-GLOBAL-IN-NAMESPACE-NEXT: VarDecl {{.+}} imported in Namespaces varInNamespace 'int' static cinit
+// CHECK-GLOBAL-IN-NAMESPACE-NEXT: IntegerLiteral {{.+}} 'int' 1
+// CHECK-GLOBAL-IN-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftVarInNamespace"
+
+// CHECK-FUNC-IN-NAMESPACE: Dumping Namespace1::funcInNamespace:
+// CHECK-FUNC-IN-NAMESPACE-NEXT: FunctionDecl {{.+}} imported in Namespaces funcInNamespace 'void ()'
+// CHECK-FUNC-IN-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftFuncInNamespace()"
+
+// CHECK-STRUCT-IN-NAMESPACE: Dumping Namespace1::char_box:
+// CHECK-STRUCT-IN-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box
+// CHECK-STRUCT-IN-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "CharBox"
+
+// CHECK-GLOBAL-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::varInNestedNamespace:
+// CHECK-GLOBAL-IN-NESTED-NAMESPACE-NEXT: VarDecl {{.+}} imported in Namespaces varInNestedNamespace 'int' static cinit
+// CHECK-GLOBAL-IN-NESTED-NAMESPACE-NEXT: IntegerLiteral {{.+}} 'int' 1
+// CHECK-GLOBAL-IN-NESTED-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftVarInNestedNamespace"
+
+// CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested2::varInNestedNamespace:
+// CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE-NEXT: VarDecl {{.+}} imported in Namespaces varInNestedNamespace 'int' static cinit
+// CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE-NEXT: IntegerLiteral {{.+}} 'int' 2
+// CHECK-ANOTHER-GLOBAL-IN-NESTED-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftAnotherVarInNestedNamespace"
+
+// CHECK-FUNC-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::funcInNestedNamespace:
+// CHECK-FUNC-IN-NESTED-NAMESPACE-NEXT: FunctionDecl {{.+}} imported in Namespaces funcInNestedNamespace 'void (int)'
+// CHECK-FUNC-IN-NESTED-NAMESPACE-NEXT: ParmVarDecl {{.+}} i 'int'
+// CHECK-FUNC-IN-NESTED-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftFuncInNestedNamespace(_:)"
+
+// CHECK-STRUCT-IN-NESTED-NAMESPACE: Dumping Namespace1::Nested1::char_box:
+// CHECK-STRUCT-IN-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box
+// CHECK-STRUCT-IN-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "NestedCharBox"
+
+// CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE: Dumping Namespace1::Nested1::Namespace1::char_box:
+// CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE-NEXT: CXXRecordDecl {{.+}} imported in Namespaces <undeserialized declarations> struct char_box
+// CHECK-STRUCT-IN-DEEP-NESTED-NAMESPACE: SwiftNameAttr {{.+}} <<invalid sloc>> "DeepNestedCharBox"
+
+// CHECK-GLOBAL-IN-INLINE-NAMESPACE: Dumping varInInlineNamespace:
+// CHECK-GLOBAL-IN-INLINE-NAMESPACE-NEXT: VarDecl {{.+}} imported in Namespaces varInInlineNamespace 'int' static cinit
+// CHECK-GLOBAL-IN-INLINE-NAMESPACE-NEXT: IntegerLiteral {{.+}} 'int' 3
+// CHECK-GLOBAL-IN-INLINE-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftVarInInlineNamespace"
+
+// CHECK-FUNC-IN-INLINE-NAMESPACE: Dumping funcInInlineNamespace:
+// CHECK-FUNC-IN-INLINE-NAMESPACE-NEXT: FunctionDecl {{.+}} imported in Namespaces funcInInlineNamespace 'void ()'
+// CHECK-FUNC-IN-INLINE-NAMESPACE-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "swiftFuncInInlineNamespace()"
diff --git a/clang/test/APINotes/nullability.c b/clang/test/APINotes/nullability.c
new file mode 100644
index 00000000000000..e07fc2e5c11743
--- /dev/null
+++ b/clang/test/APINotes/nullability.c
@@ -0,0 +1,21 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify
+
+#include "HeaderLib.h"
+
+int main() {
+  custom_realloc(0, 0); // expected-warning{{null passed to a callee that requires a non-null argument}}
+  int i = 0;
+  do_something_with_pointers(&i, 0);
+  do_something_with_pointers(0, &i); // expected-warning{{null passed to a callee that requires a non-null argument}}
+  
+  extern void *p;
+  do_something_with_arrays(0, p); // expected-warning{{null passed to a callee that requires a non-null argument}}
+  do_something_with_arrays(p, 0); // expected-warning{{null passed to a callee that requires a non-null argument}}
+
+  take_pointer_and_int(0, 0); // expected-warning{{null passed to a callee that requires a non-null argument}}
+
+  float *fp = global_int; // expected-warning{{incompatible pointer types initializing 'float *' with an expression of type 'int * _Nonnull'}}
+  return 0;
+}
+
diff --git a/clang/test/APINotes/nullability.m b/clang/test/APINotes/nullability.m
new file mode 100644
index 00000000000000..21ec6680fa714d
--- /dev/null
+++ b/clang/test/APINotes/nullability.m
@@ -0,0 +1,46 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify
+
+// Test with Swift version 3.0. This should only affect the few APIs that have an entry in the 3.0 tables.
+
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fapinotes-swift-version=3.0 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify -DSWIFT_VERSION_3_0 -fmodules-ignore-macro=SWIFT_VERSION_3_0
+
+#import <SomeKit/SomeKit.h>
+
+int main() {
+  A *a;
+
+#if SWIFT_VERSION_3_0
+  float *fp =  // expected-warning{{incompatible pointer types initializing 'float *' with an expression of type 'A * _Nullable'}}
+    [a transform: 0 integer: 0];
+#else
+  float *fp =  // expected-warning{{incompatible pointer types initializing 'float *' with an expression of type 'A *'}}
+    [a transform: 0 integer: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}}
+#endif
+
+  [a setNonnullAInstance: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}}
+  [A setNonnullAInstance: 0]; // no warning
+  a.nonnullAInstance = 0; // expected-warning{{null passed to a callee that requires a non-null argument}}
+  A* _Nonnull aPtr = a.nonnullAInstance; // no warning
+
+  [a setNonnullAClass: 0]; // no warning
+  [A setNonnullAClass: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}}
+
+  [a setNonnullABoth: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}}
+  [A setNonnullABoth: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}}
+
+  [a setInternalProperty: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}}
+
+#if SWIFT_VERSION_3_0
+  // Version 3 information overrides header information.
+  [a setExplicitNonnullInstance: 0]; //  okay
+  [a setExplicitNullableInstance: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}}
+#else
+  // Header information overrides unversioned information.
+  [a setExplicitNonnullInstance: 0]; // expected-warning{{null passed to a callee that requires a non-null argument}}
+  [a setExplicitNullableInstance: 0]; // okay
+#endif
+
+  return 0;
+}
+
diff --git a/clang/test/APINotes/objc-forward-declarations.m b/clang/test/APINotes/objc-forward-declarations.m
new file mode 100644
index 00000000000000..e82bed20555049
--- /dev/null
+++ b/clang/test/APINotes/objc-forward-declarations.m
@@ -0,0 +1,12 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -F %S/Inputs/Frameworks %s -verify
+
+@import LayeredKit;
+
+void test(
+  UpwardClass *okayClass,
+  id <UpwardProto> okayProto,
+  PerfectlyNormalClass *badClass // expected-error {{'PerfectlyNormalClass' is unavailable}}
+) {
+  // expected-note@LayeredKitImpl/LayeredKitImpl.h:4 {{'PerfectlyNormalClass' has been explicitly marked unavailable here}}
+}
diff --git a/clang/test/APINotes/objc_designated_inits.m b/clang/test/APINotes/objc_designated_inits.m
new file mode 100644
index 00000000000000..1f2b8ed534b7a1
--- /dev/null
+++ b/clang/test/APINotes/objc_designated_inits.m
@@ -0,0 +1,17 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify
+
+#include "HeaderLib.h"
+#import <SomeKit/SomeKit.h>
+
+@interface CSub : C
+-(instancetype)initWithA:(A*)a;
+@end
+
+@implementation CSub
+-(instancetype)initWithA:(A*)a { // expected-warning{{designated initializer missing a 'super' call to a designated initializer of the super class}}
+  // expected-note@SomeKit/SomeKit.h:20 2{{method marked as designated initializer of the class here}}
+  self = [super init]; // expected-warning{{designated initializer invoked a non-designated initializer}}
+  return self;
+}
+@end
diff --git a/clang/test/APINotes/properties.m b/clang/test/APINotes/properties.m
new file mode 100644
index 00000000000000..f218092a66e1dc
--- /dev/null
+++ b/clang/test/APINotes/properties.m
@@ -0,0 +1,42 @@
+// RUN: rm -rf %t && mkdir -p %t
+
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules  -fblocks -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'TestProperties::' | FileCheck -check-prefix=CHECK -check-prefix=CHECK-4 %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules  -fblocks -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'TestProperties::' -fapinotes-swift-version=3 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-3 %s
+
+@import VersionedKit;
+
+// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnly 'id'
+// CHECK-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-NOT: Attr
+
+// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyForClass 'id'
+// CHECK-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-NOT: Attr
+
+// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyInVersion3 'id'
+// CHECK-3-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-4-NEXT: SwiftVersionedAdditionAttr {{.+}} 3.0{{$}}
+// CHECK-4-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-NOT: Attr
+
+// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyForClassInVersion3 'id'
+// CHECK-3-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-4-NEXT: SwiftVersionedAdditionAttr {{.+}} 3.0{{$}}
+// CHECK-4-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-NOT: Attr
+
+// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyExceptInVersion3 'id'
+// CHECK-3-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 IsReplacedByActive{{$}}
+// CHECK-3-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-4-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-4-NEXT: SwiftVersionedRemovalAttr {{.+}} Implicit 3.0 {{[0-9]+}}
+// CHECK-NOT: Attr
+
+// CHECK-LABEL: ObjCPropertyDecl {{.+}} accessorsOnlyForClassExceptInVersion3 'id'
+// CHECK-3-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 IsReplacedByActive{{$}}
+// CHECK-3-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-4-NEXT: SwiftImportPropertyAsAccessorsAttr {{.+}} <<invalid sloc>>
+// CHECK-4-NEXT: SwiftVersionedRemovalAttr {{.+}} Implicit 3.0 {{[0-9]+}}
+// CHECK-NOT: Attr
+
+// CHECK-LABEL: Decl
diff --git a/clang/test/APINotes/retain-count-convention.m b/clang/test/APINotes/retain-count-convention.m
new file mode 100644
index 00000000000000..4bf9610a352a75
--- /dev/null
+++ b/clang/test/APINotes/retain-count-convention.m
@@ -0,0 +1,38 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules  -fdisable-module-hash -fsyntax-only -F %S/Inputs/Frameworks %s
+// RUN: %clang_cc1 -ast-print %t/ModulesCache/SimpleKit.pcm | FileCheck %s
+// RUN: %clang_cc1 -ast-dump -ast-dump-filter 'DUMP' %t/ModulesCache/SimpleKit.pcm | FileCheck -check-prefix CHECK-DUMP %s
+
+#import <SimpleKit/SimpleKit.h>
+
+// CHECK: void *getCFOwnedToUnowned(void) __attribute__((cf_returns_not_retained));
+// CHECK: void *getCFUnownedToOwned(void) __attribute__((cf_returns_retained));
+// CHECK: void *getCFOwnedToNone(void) __attribute__((cf_unknown_transfer));
+// CHECK: id getObjCOwnedToUnowned(void) __attribute__((ns_returns_not_retained));
+// CHECK: id getObjCUnownedToOwned(void) __attribute__((ns_returns_retained));
+// CHECK: int indirectGetCFOwnedToUnowned(void * _Nullable *out __attribute__((cf_returns_not_retained)));
+// CHECK: int indirectGetCFUnownedToOwned(void * _Nullable *out __attribute__((cf_returns_retained)));
+// CHECK: int indirectGetCFOwnedToNone(void * _Nullable *out);
+// CHECK: int indirectGetCFNoneToOwned(void **out __attribute__((cf_returns_not_retained)));
+
+// CHECK-LABEL: @interface MethodTest
+// CHECK: - (id)getOwnedToUnowned __attribute__((ns_returns_not_retained));
+// CHECK: - (id)getUnownedToOwned __attribute__((ns_returns_retained));
+// CHECK: @end
+
+// CHECK-DUMP-LABEL: Dumping getCFAuditedToUnowned_DUMP:
+// CHECK-DUMP-NEXT: FunctionDecl
+// CHECK-DUMP-NEXT: CFReturnsNotRetainedAttr
+// CHECK-DUMP-NEXT: CFAuditedTransferAttr
+// CHECK-DUMP-NOT: Attr
+
+// CHECK-DUMP-LABEL: Dumping getCFAuditedToOwned_DUMP:
+// CHECK-DUMP-NEXT: FunctionDecl
+// CHECK-DUMP-NEXT: CFReturnsRetainedAttr
+// CHECK-DUMP-NEXT: CFAuditedTransferAttr
+// CHECK-DUMP-NOT: Attr
+
+// CHECK-DUMP-LABEL: Dumping getCFAuditedToNone_DUMP:
+// CHECK-DUMP-NEXT: FunctionDecl
+// CHECK-DUMP-NEXT: CFUnknownTransferAttr
+// CHECK-DUMP-NOT: Attr
diff --git a/clang/test/APINotes/search-order.m b/clang/test/APINotes/search-order.m
new file mode 100644
index 00000000000000..17e81d5eb2d691
--- /dev/null
+++ b/clang/test/APINotes/search-order.m
@@ -0,0 +1,25 @@
+// RUN: rm -rf %t && mkdir -p %t
+
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -DFROM_FRAMEWORK=1 -verify
+
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -iapinotes-modules %S/Inputs/APINotes  -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -DFROM_SEARCH_PATH=1 -verify
+
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -iapinotes-modules %S/Inputs/APINotes  -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -DFROM_FRAMEWORK=1 -verify
+
+@import SomeOtherKit;
+
+void test(A *a) {
+#if FROM_FRAMEWORK
+  [a methodA]; // expected-error{{unavailable}}
+  [a methodB];
+
+  // expected-note@SomeOtherKit/SomeOtherKit.h:5{{'methodA' has been explicitly marked unavailable here}}
+#elif FROM_SEARCH_PATH
+  [a methodA];
+  [a methodB]; // expected-error{{unavailable}}
+
+  // expected-note@SomeOtherKit/SomeOtherKit.h:6{{'methodB' has been explicitly marked unavailable here}}
+#else
+#  error Not something we need to test
+#endif
+}
diff --git a/clang/test/APINotes/swift-import-as.cpp b/clang/test/APINotes/swift-import-as.cpp
new file mode 100644
index 00000000000000..904857e5859303
--- /dev/null
+++ b/clang/test/APINotes/swift-import-as.cpp
@@ -0,0 +1,16 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter ImmortalRefType | FileCheck -check-prefix=CHECK-IMMORTAL %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter RefCountedType | FileCheck -check-prefix=CHECK-REF-COUNTED %s
+
+#include <SwiftImportAs.h>
+
+// CHECK-IMMORTAL: Dumping ImmortalRefType:
+// CHECK-IMMORTAL-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct ImmortalRefType
+// CHECK-IMMORTAL: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_reference"
+
+// CHECK-REF-COUNTED: Dumping RefCountedType:
+// CHECK-REF-COUNTED-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct RefCountedType
+// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_reference"
+// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "retain:RCRetain"
+// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "release:RCRelease"
diff --git a/clang/test/APINotes/top-level-private-modules.c b/clang/test/APINotes/top-level-private-modules.c
new file mode 100644
index 00000000000000..0da72b2e36f4f2
--- /dev/null
+++ b/clang/test/APINotes/top-level-private-modules.c
@@ -0,0 +1,8 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify
+
+#include <PrivateLib.h>
+#include <TopLevelPrivateKit/TopLevelPrivateKit_Private.h>
+
+void *testPlain = PrivateLib; // expected-error {{initializing 'void *' with an expression of incompatible type 'float'}}
+void *testFramework = TopLevelPrivateKit_Private; // expected-error {{initializing 'void *' with an expression of incompatible type 'float'}}
diff --git a/clang/test/APINotes/types.m b/clang/test/APINotes/types.m
new file mode 100644
index 00000000000000..133d504713d76c
--- /dev/null
+++ b/clang/test/APINotes/types.m
@@ -0,0 +1,28 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fapinotes-modules -Wno-private-module -fdisable-module-hash -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -verify
+// RUN: %clang_cc1 -ast-print %t/ModulesCache/SimpleKit.pcm | FileCheck %s
+
+#import <SomeKit/SomeKit.h>
+#import <SimpleKit/SimpleKit.h>
+
+// CHECK: struct __attribute__((swift_name("SuccessfullyRenamedA"))) RenamedAgainInAPINotesA {
+// CHECK: struct __attribute__((swift_name("SuccessfullyRenamedB"))) RenamedAgainInAPINotesB {
+
+void test(OverriddenTypes *overridden) {
+  int *ip1 = global_int_ptr; // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'double (*)(int, int)'}}
+
+  int *ip2 = global_int_fun( // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'char *'}}
+               ip2, // expected-warning{{incompatible pointer types passing 'int *' to parameter of type 'double *'}}
+               ip2); // expected-warning{{incompatible pointer types passing 'int *' to parameter of type 'float *'}}
+
+  int *ip3 = [overridden // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'char *'}}
+                methodToMangle: ip3 // expected-warning{{incompatible pointer types sending 'int *' to parameter of type 'double *'}}
+                        second: ip3]; // expected-warning{{incompatible pointer types sending 'int *' to parameter of type 'float *'}}
+
+  int *ip4 = overridden.intPropertyToMangle; // expected-warning{{incompatible pointer types initializing 'int *' with an expression of type 'double *'}}
+}
+
+// expected-note@SomeKit/SomeKit.h:42{{passing argument to parameter 'ptr' here}}
+// expected-note@SomeKit/SomeKit.h:42{{passing argument to parameter 'ptr2' here}}
+// expected-note@SomeKit/SomeKit.h:48{{passing argument to parameter 'ptr1' here}}
+// expected-note@SomeKit/SomeKit.h:48{{passing argument to parameter 'ptr2' here}}
diff --git a/clang/test/APINotes/versioned-multi.c b/clang/test/APINotes/versioned-multi.c
new file mode 100644
index 00000000000000..48c51fd932e17c
--- /dev/null
+++ b/clang/test/APINotes/versioned-multi.c
@@ -0,0 +1,69 @@
+// RUN: rm -rf %t && mkdir -p %t
+
+// Build and check the unversioned module file.
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s
+// RUN: %clang_cc1 -ast-print %t/ModulesCache/Unversioned/VersionedKit.pcm | FileCheck -check-prefix=CHECK-UNVERSIONED %s
+
+// Build and check the various versions.
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned3 -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s
+// RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned3/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED-3 %s
+
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned4 -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=4 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s
+// RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned4/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED-4 %s
+
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned5 -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=5 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s
+// RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned5/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED-5 %s
+
+#import <VersionedKit/VersionedKit.h>
+
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef4;
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef4Notes __attribute__((swift_name("MultiVersionedTypedef4Notes_NEW")));
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_NEW")));
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef34;
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef34Notes __attribute__((swift_name("MultiVersionedTypedef34Notes_NEW")));
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_NEW")));
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef45;
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef45Notes __attribute__((swift_name("MultiVersionedTypedef45Notes_NEW")));
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_NEW")));
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef345;
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef345Notes __attribute__((swift_name("MultiVersionedTypedef345Notes_NEW")));
+// CHECK-UNVERSIONED: typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_NEW")));
+
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef4 __attribute__((swift_name("MultiVersionedTypedef4_4")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef4Notes __attribute__((swift_name("MultiVersionedTypedef4Notes_4")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_4")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef34 __attribute__((swift_name("MultiVersionedTypedef34_3")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef34Notes __attribute__((swift_name("MultiVersionedTypedef34Notes_3")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_3")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef45 __attribute__((swift_name("MultiVersionedTypedef45_4")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef45Notes __attribute__((swift_name("MultiVersionedTypedef45Notes_4")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_4")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef345 __attribute__((swift_name("MultiVersionedTypedef345_3")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef345Notes __attribute__((swift_name("MultiVersionedTypedef345Notes_3")));
+// CHECK-VERSIONED-3: typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_3")));
+
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef4 __attribute__((swift_name("MultiVersionedTypedef4_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef4Notes __attribute__((swift_name("MultiVersionedTypedef4Notes_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef34 __attribute__((swift_name("MultiVersionedTypedef34_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef34Notes __attribute__((swift_name("MultiVersionedTypedef34Notes_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef45 __attribute__((swift_name("MultiVersionedTypedef45_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef45Notes __attribute__((swift_name("MultiVersionedTypedef45Notes_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef345 __attribute__((swift_name("MultiVersionedTypedef345_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef345Notes __attribute__((swift_name("MultiVersionedTypedef345Notes_4")));
+// CHECK-VERSIONED-4: typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_4")));
+
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef4;
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef4Notes __attribute__((swift_name("MultiVersionedTypedef4Notes_NEW")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef4Header __attribute__((swift_name("MultiVersionedTypedef4Header_NEW")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef34;
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef34Notes __attribute__((swift_name("MultiVersionedTypedef34Notes_NEW")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef34Header __attribute__((swift_name("MultiVersionedTypedef34Header_NEW")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef45 __attribute__((swift_name("MultiVersionedTypedef45_5")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef45Notes __attribute__((swift_name("MultiVersionedTypedef45Notes_5")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef45Header __attribute__((swift_name("MultiVersionedTypedef45Header_5")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef345 __attribute__((swift_name("MultiVersionedTypedef345_5")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef345Notes __attribute__((swift_name("MultiVersionedTypedef345Notes_5")));
+// CHECK-VERSIONED-5: typedef int MultiVersionedTypedef345Header __attribute__((swift_name("MultiVersionedTypedef345Header_5")));
diff --git a/clang/test/APINotes/versioned.m b/clang/test/APINotes/versioned.m
new file mode 100644
index 00000000000000..61cc8c3f7c4d1e
--- /dev/null
+++ b/clang/test/APINotes/versioned.m
@@ -0,0 +1,187 @@
+// RUN: rm -rf %t && mkdir -p %t
+
+// Build and check the unversioned module file.
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s
+// RUN: %clang_cc1 -ast-print %t/ModulesCache/Unversioned/VersionedKit.pcm | FileCheck -check-prefix=CHECK-UNVERSIONED %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Unversioned -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' | FileCheck -check-prefix=CHECK-DUMP -check-prefix=CHECK-UNVERSIONED-DUMP %s
+
+// Build and check the versioned module file.
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s
+// RUN: %clang_cc1 -ast-print %t/ModulesCache/Versioned/VersionedKit.pcm | FileCheck -check-prefix=CHECK-VERSIONED %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Versioned -fdisable-module-hash -fapinotes-modules -fapinotes-swift-version=3 -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter 'DUMP' | FileCheck -check-prefix=CHECK-DUMP -check-prefix=CHECK-VERSIONED-DUMP %s
+
+#import <VersionedKit/VersionedKit.h>
+
+// CHECK-UNVERSIONED: void moveToPointDUMP(double x, double y) __attribute__((swift_name("moveTo(x:y:)")));
+// CHECK-VERSIONED: void moveToPointDUMP(double x, double y) __attribute__((swift_name("moveTo(a:b:)")));
+
+// CHECK-DUMP-LABEL: Dumping moveToPointDUMP
+// CHECK-VERSIONED-DUMP: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 IsReplacedByActive{{$}}
+// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "moveTo(x:y:)"
+// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "moveTo(a:b:)"
+// CHECK-UNVERSIONED-DUMP: SwiftNameAttr {{.+}} "moveTo(x:y:)"
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0{{$}}
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "moveTo(a:b:)"
+// CHECK-DUMP-NOT: Attr
+
+// CHECK-DUMP-LABEL: Dumping unversionedRenameDUMP
+// CHECK-DUMP: in VersionedKit unversionedRenameDUMP
+// CHECK-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 0 IsReplacedByActive{{$}}
+// CHECK-DUMP-NEXT: SwiftNameAttr {{.+}} "unversionedRename_HEADER()"
+// CHECK-DUMP-NEXT: SwiftNameAttr {{.+}} "unversionedRename_NOTES()"
+// CHECK-DUMP-NOT: Attr
+
+// CHECK-DUMP-LABEL: Dumping TestGenericDUMP
+// CHECK-VERSIONED-DUMP: SwiftImportAsNonGenericAttr {{.+}} <<invalid sloc>>
+// CHECK-UNVERSIONED-DUMP: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0{{$}}
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftImportAsNonGenericAttr {{.+}} <<invalid sloc>>
+// CHECK-DUMP-NOT: Attr
+
+// CHECK-DUMP-LABEL: Dumping Swift3RenamedOnlyDUMP
+// CHECK-DUMP: in VersionedKit Swift3RenamedOnlyDUMP
+// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedRemovalAttr {{.+}} Implicit 3.0 {{[0-9]+}} IsReplacedByActive{{$}}
+// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift3Name"
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0{{$}}
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "SpecialSwift3Name"
+// CHECK-DUMP-NOT: Attr
+
+// CHECK-DUMP-LABEL: Dumping Swift3RenamedAlsoDUMP
+// CHECK-DUMP: in VersionedKit Swift3RenamedAlsoDUMP
+// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0 IsReplacedByActive{{$}}
+// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <line:{{.+}}, col:{{.+}}> "Swift4Name"
+// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift3Also"
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <line:{{.+}}, col:{{.+}}> "Swift4Name"
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 3.0{{$}}
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "SpecialSwift3Also"
+// CHECK-DUMP-NOT: Attr
+
+// CHECK-DUMP-LABEL: Dumping Swift4RenamedDUMP
+// CHECK-DUMP: in VersionedKit Swift4RenamedDUMP
+// CHECK-VERSIONED-DUMP-NEXT: SwiftVersionedRemovalAttr {{.+}} Implicit 4 {{[0-9]+}} IsReplacedByActive{{$}}
+// CHECK-VERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} "SpecialSwift4Name"
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftVersionedAdditionAttr {{.+}} Implicit 4{{$}}
+// CHECK-UNVERSIONED-DUMP-NEXT: SwiftNameAttr {{.+}} <<invalid sloc>> "SpecialSwift4Name"
+// CHECK-DUMP-NOT: Attr
+
+// CHECK-DUMP-NOT: Dumping
+
+// CHECK-UNVERSIONED: void acceptClosure(void (^block)(void) __attribute__((noescape)));
+// CHECK-VERSIONED: void acceptClosure(void (^block)(void));
+
+// CHECK-UNVERSIONED: void privateFunc(void) __attribute__((swift_private));
+
+// CHECK-UNVERSIONED: typedef double MyDoubleWrapper __attribute__((swift_wrapper("struct")));
+
+// CHECK-UNVERSIONED:      enum __attribute__((ns_error_domain(MyErrorDomain))) MyErrorCode {
+// CHECK-UNVERSIONED-NEXT:     MyErrorCodeFailed = 1
+// CHECK-UNVERSIONED-NEXT: };
+
+// CHECK-UNVERSIONED: __attribute__((swift_bridge("MyValueType")))
+// CHECK-UNVERSIONED: @interface MyReferenceType
+
+// CHECK-VERSIONED: void privateFunc(void);
+
+// CHECK-VERSIONED: typedef double MyDoubleWrapper;
+
+// CHECK-VERSIONED:      enum MyErrorCode {
+// CHECK-VERSIONED-NEXT:     MyErrorCodeFailed = 1
+// CHECK-VERSIONED-NEXT: };
+
+// CHECK-VERSIONED-NOT: __attribute__((swift_bridge("MyValueType")))
+// CHECK-VERSIONED: @interface MyReferenceType
+
+// CHECK-UNVERSIONED: __attribute__((swift_objc_members)
+// CHECK-UNVERSIONED-NEXT: @interface TestProperties
+// CHECK-VERSIONED-NOT: __attribute__((swift_objc_members)
+// CHECK-VERSIONED: @interface TestProperties
+
+// CHECK-UNVERSIONED-LABEL: enum __attribute__((flag_enum)) FlagEnum {
+// CHECK-UNVERSIONED-NEXT:     FlagEnumA = 1,
+// CHECK-UNVERSIONED-NEXT:     FlagEnumB = 2
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum __attribute__((flag_enum)) NewlyFlagEnum {
+// CHECK-UNVERSIONED-NEXT:     NewlyFlagEnumA = 1,
+// CHECK-UNVERSIONED-NEXT:     NewlyFlagEnumB = 2
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum __attribute__((flag_enum)) APINotedFlagEnum {
+// CHECK-UNVERSIONED-NEXT:     APINotedFlagEnumA = 1,
+// CHECK-UNVERSIONED-NEXT:     APINotedFlagEnumB = 2
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("open"))) OpenEnum {
+// CHECK-UNVERSIONED-NEXT:     OpenEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("open"))) NewlyOpenEnum {
+// CHECK-UNVERSIONED-NEXT:     NewlyOpenEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("closed"))) NewlyClosedEnum {
+// CHECK-UNVERSIONED-NEXT:     NewlyClosedEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("open"))) ClosedToOpenEnum {
+// CHECK-UNVERSIONED-NEXT:     ClosedToOpenEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("closed"))) OpenToClosedEnum {
+// CHECK-UNVERSIONED-NEXT:     OpenToClosedEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("open"))) APINotedOpenEnum {
+// CHECK-UNVERSIONED-NEXT:     APINotedOpenEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("closed"))) APINotedClosedEnum {
+// CHECK-UNVERSIONED-NEXT:     APINotedClosedEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+
+// CHECK-VERSIONED-LABEL: enum __attribute__((flag_enum)) FlagEnum {
+// CHECK-VERSIONED-NEXT:     FlagEnumA = 1,
+// CHECK-VERSIONED-NEXT:     FlagEnumB = 2
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum NewlyFlagEnum {
+// CHECK-VERSIONED-NEXT:     NewlyFlagEnumA = 1,
+// CHECK-VERSIONED-NEXT:     NewlyFlagEnumB = 2
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum __attribute__((flag_enum)) APINotedFlagEnum {
+// CHECK-VERSIONED-NEXT:     APINotedFlagEnumA = 1,
+// CHECK-VERSIONED-NEXT:     APINotedFlagEnumB = 2
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) OpenEnum {
+// CHECK-VERSIONED-NEXT:     OpenEnumA = 1
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum NewlyOpenEnum {
+// CHECK-VERSIONED-NEXT:     NewlyOpenEnumA = 1
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum NewlyClosedEnum {
+// CHECK-VERSIONED-NEXT:     NewlyClosedEnumA = 1
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) ClosedToOpenEnum {
+// CHECK-VERSIONED-NEXT:     ClosedToOpenEnumA = 1
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) OpenToClosedEnum {
+// CHECK-VERSIONED-NEXT:     OpenToClosedEnumA = 1
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("open"))) APINotedOpenEnum {
+// CHECK-VERSIONED-NEXT:     APINotedOpenEnumA = 1
+// CHECK-VERSIONED-NEXT: };
+// CHECK-VERSIONED-LABEL: enum __attribute__((enum_extensibility("closed"))) APINotedClosedEnum {
+// CHECK-VERSIONED-NEXT:     APINotedClosedEnumA = 1
+// CHECK-VERSIONED-NEXT: };
+
+// These don't actually have versioned information, so we just check them once.
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("open"))) SoonToBeCFEnum {
+// CHECK-UNVERSIONED-NEXT:     SoonToBeCFEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("open"))) SoonToBeNSEnum {
+// CHECK-UNVERSIONED-NEXT:     SoonToBeNSEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("open"))) __attribute__((flag_enum)) SoonToBeCFOptions {
+// CHECK-UNVERSIONED-NEXT:     SoonToBeCFOptionsA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("open"))) __attribute__((flag_enum)) SoonToBeNSOptions {
+// CHECK-UNVERSIONED-NEXT:     SoonToBeNSOptionsA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("closed"))) SoonToBeCFClosedEnum {
+// CHECK-UNVERSIONED-NEXT:     SoonToBeCFClosedEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum  __attribute__((enum_extensibility("closed"))) SoonToBeNSClosedEnum {
+// CHECK-UNVERSIONED-NEXT:     SoonToBeNSClosedEnumA = 1
+// CHECK-UNVERSIONED-NEXT: };
+// CHECK-UNVERSIONED-LABEL: enum UndoAllThatHasBeenDoneToMe {
+// CHECK-UNVERSIONED-NEXT:     UndoAllThatHasBeenDoneToMeA = 1
+// CHECK-UNVERSIONED-NEXT: };
diff --git a/clang/test/APINotes/yaml-convert-diags.c b/clang/test/APINotes/yaml-convert-diags.c
new file mode 100644
index 00000000000000..1d352dc2c52309
--- /dev/null
+++ b/clang/test/APINotes/yaml-convert-diags.c
@@ -0,0 +1,6 @@
+// RUN: rm -rf %t
+// RUN: not %clang_cc1 -fsyntax-only -fapinotes %s -I %S/Inputs/BrokenHeaders2 2>&1 | FileCheck %s
+
+#include "SomeBrokenLib.h"
+
+// CHECK: error: multiple definitions of global function 'do_something_with_pointers'
diff --git a/clang/test/APINotes/yaml-parse-diags.c b/clang/test/APINotes/yaml-parse-diags.c
new file mode 100644
index 00000000000000..3ae39ccb301d3d
--- /dev/null
+++ b/clang/test/APINotes/yaml-parse-diags.c
@@ -0,0 +1,6 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fsyntax-only -fapinotes %s -I %S/Inputs/BrokenHeaders -verify
+
+#include "SomeBrokenLib.h"
+
+// expected-error@APINotes.apinotes:4{{unknown key 'Nu llabilityOfRet'}}
diff --git a/clang/test/APINotes/yaml-reader-errors.m b/clang/test/APINotes/yaml-reader-errors.m
new file mode 100644
index 00000000000000..9e5ee34c3e4152
--- /dev/null
+++ b/clang/test/APINotes/yaml-reader-errors.m
@@ -0,0 +1,5 @@
+// RUN: rm -rf %t
+// RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fapinotes -fapinotes-modules -fmodules-cache-path=%t -I %S/Inputs/yaml-reader-errors/ -fsyntax-only %s > %t.err 2>&1
+// RUN: FileCheck %S/Inputs/yaml-reader-errors/UIKit.apinotes < %t.err
+
+@import UIKit;

>From b343b02a88821cab320e7d9976a05eabd0df29ec Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Wed, 27 Mar 2024 13:14:36 +0000
Subject: [PATCH 22/54] [gn build] Port 4f9aab2b500d

---
 llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn
index c0ea62716fd20a..daa3278d56d7c1 100644
--- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Common/BUILD.gn
@@ -18,9 +18,11 @@ static_library("Common") {
     "DAGISelMatcher.cpp",
     "GlobalISel/CXXPredicates.cpp",
     "GlobalISel/CodeExpander.cpp",
+    "GlobalISel/CombinerUtils.cpp",
     "GlobalISel/GlobalISelMatchTable.cpp",
     "GlobalISel/GlobalISelMatchTableExecutorEmitter.cpp",
     "GlobalISel/MatchDataInfo.cpp",
+    "GlobalISel/PatternParser.cpp",
     "GlobalISel/Patterns.cpp",
     "InfoByHwMode.cpp",
     "OptEmitter.cpp",

>From 2fa46ca922178ec049006a1b4851058400cbada9 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar at redhat.com>
Date: Wed, 27 Mar 2024 06:25:10 -0700
Subject: [PATCH 23/54] [workflows] Update the version of the scorecard-action
 (#86753)

I'm hoping this will fix the errors we've been seeing the last few days:

2024-03-19T20:44:07.4841482Z 2024/03/19 20:44:07 error signing scorecard
json results: error signing payload: getting key from Fulcio: verifying
SCT: updating local metadata and targets: error updating to TUF remote
mirror: invalid key
---
 .github/workflows/scorecard.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index b8e8ab26c3ffa6..ff61cf83a6af3c 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -36,7 +36,7 @@ jobs:
           persist-credentials: false
 
       - name: "Run analysis"
-        uses: ossf/scorecard-action@e38b1902ae4f44df626f11ba0734b14fb91f8f86 # v2.1.2
+        uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
         with:
           results_file: results.sarif
           results_format: sarif

>From 6e6d266fb8cc1398e7d5a220a9332d88ce074464 Mon Sep 17 00:00:00 2001
From: zibi2 <62662650+zibi2 at users.noreply.github.com>
Date: Wed, 27 Mar 2024 09:50:25 -0400
Subject: [PATCH 24/54] [libc++] Fix one case in saturate_cast.pass.cpp for
 64-bit on z/OS (#86724)

On z/OS, int128 is disabled, causing one of the cases in
`saturate_cast.pass.cpp` to fail. The failure occurs only in 64-bit mode.
In this case, `std::numeric_limits<long long int>::max()` is within the range
[`std::numeric_limits<unsigned long int>::min()`,
`std::numeric_limits<unsigned long int>::max()`]; therefore,
`saturate_cast<unsigned long int>(sBigMax) == LONG_MAX`, not `ULONG_MAX`
as the original test expected.

In 32-bit, `saturate_cast<unsigned long int>( sBigMax) == ULONG_MAX`
like on other platforms where int128 is enabled.

This PR is required to pass this test case on z/OS and possibly on other
platforms where int128 is not supported/enabled.

---------

Co-authored-by: Sean Perry <perry at ca.ibm.com>
---
 .../numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp
index c06a9ed2d5cb42..cbca37e3a66139 100644
--- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp
@@ -329,7 +329,7 @@ constexpr bool test() {
   { [[maybe_unused]] std::same_as<unsigned long int> decltype(auto) _ = std::saturate_cast<unsigned long int>(sBigMax); }
   assert(std::saturate_cast<unsigned long int>(  sBigMin) == 0UL);       // saturated
   assert(std::saturate_cast<unsigned long int>(    sZero) == 0UL);
-  assert(std::saturate_cast<unsigned long int>(  sBigMax) == ULONG_MAX); // saturated
+  assert(std::saturate_cast<unsigned long int>(  sBigMax) == (sizeof(UIntT) > sizeof(unsigned long int) ? ULONG_MAX : LONG_MAX)); // saturated depending on underlying types
 
   { [[maybe_unused]] std::same_as<unsigned long int> decltype(auto) _ = std::saturate_cast<unsigned long int>(uBigMax); }
   assert(std::saturate_cast<unsigned long int>(    uZero) == 0UL);

>From 2cb7ea1553a5c7be81bee4ed3c51b7727b9d2ee8 Mon Sep 17 00:00:00 2001
From: Felipe de Azevedo Piovezan <fpiovezan at apple.com>
Date: Wed, 27 Mar 2024 07:02:12 -0700
Subject: [PATCH 25/54] [lldb][nfc] Delete unused variable (#86740)

This was made unused by d9ec4b24a84addb8bd77b5d9dd990181351cf84c.
---
 lldb/source/Target/StackFrame.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index 3af62f52d57546..03a74f29e76e99 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -1800,7 +1800,6 @@ void StackFrame::DumpUsingSettingsFormat(Stream *strm, bool show_unique,
     return;
 
   ExecutionContext exe_ctx(shared_from_this());
-  StreamString s;
 
   const FormatEntity::Entry *frame_format = nullptr;
   Target *target = exe_ctx.GetTargetPtr();

>From f5296df97c6bdc6cb658691e5863fdbf336d4430 Mon Sep 17 00:00:00 2001
From: "Kevin P. Neal" <52762977+kpneal at users.noreply.github.com>
Date: Wed, 27 Mar 2024 10:20:00 -0400
Subject: [PATCH 26/54] [FPEnv][AMDGPU] Correct AMDGPUSimplifyLibCalls handling
 of strictfp attribute. (#86705)

The AMDGPUSimplifyLibCalls pass was lowering function calls with the
strictfp attribute to sequences that included function calls incorrectly
lacking the attribute. This patch corrects that.

The pass now also emits the correct constrained fp call instead of
normal FP instructions when in a function with the strictfp attribute.
Replacing non-constrained calls with constrained calls when required
is still on the IRBuilder's TODO list.
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp            |  2 ++
 .../CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll   | 12 ++++++------
 .../CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll  |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 84b4ccc1ae7ba7..5aa35becd842c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -657,6 +657,8 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
     return true;
 
   IRBuilder<> B(CI);
+  if (CI->isStrictFP())
+    B.setIsFPConstrained(true);
 
   if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
     // Under unsafe-math, evaluate calls if possible.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
index 942f459ea6b8ca..8ddaf243db92c4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
@@ -808,7 +808,7 @@ define float @test_pown_fast_f32_nobuiltin(float %x, i32 %y) {
 ; CHECK-LABEL: define float @test_pown_fast_f32_nobuiltin
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    ret float [[CALL]]
 ;
 entry:
@@ -820,11 +820,11 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 {
 ; CHECK-LABEL: define float @test_pown_fast_f32_strictfp
 ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]])
-; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float
-; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]])
+; CHECK-NEXT:    [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) #[[ATTR0]]
+; CHECK-NEXT:    [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) #[[ATTR0]]
+; CHECK-NEXT:    [[POWNI2F:%.*]] = call fast float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[Y]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]]
+; CHECK-NEXT:    [[__YLOGX:%.*]] = call fast float @llvm.experimental.constrained.fmul.f32(float [[POWNI2F]], float [[__LOG2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]]
+; CHECK-NEXT:    [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) #[[ATTR0]]
 ; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i32 [[Y]], 31
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
index 2ffa647d1869a5..2e64a3456c2427 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
@@ -896,7 +896,7 @@ define float @test_rootn_f32__y_neg2__strictfp(float %x) #1 {
 ; CHECK-LABEL: define float @test_rootn_f32__y_neg2__strictfp(
 ; CHECK-SAME: float [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT:    [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) #[[ATTR0]]
 ; CHECK-NEXT:    ret float [[__ROOTN2RSQRT]]
 ;
 entry:

>From b43ec8e62b5f5a39be378c460339217511261400 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 27 Mar 2024 07:16:50 -0700
Subject: [PATCH 27/54] [SLP]Fix PR86798: handle phi nodes being trunced, but
 not its operands.

If the phi node is truncated but its operand(s) are not, the assertion needs
to account for this situation; the code already does the right transformation.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  3 +-
 .../X86/phi-node-bitwidt-op-not.ll            | 95 +++++++++++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fbf1cb6a976ff9..e1f26b922dbe4d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11926,7 +11926,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
         Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
         if (VecTy != Vec->getType()) {
-          assert((getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
+          assert((It != MinBWs.end() ||
+                  getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
                   MinBWs.contains(getOperandEntry(E, I))) &&
                  "Expected item in MinBWs.");
           Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
new file mode 100644
index 00000000000000..f376ca71c77693
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[B:%.*]], ptr [[C:%.*]], i32 [[TMP0:%.*]], ptr [[A:%.*]], i1 [[TOBOOL3_NOT:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[TOBOOL3_NOT]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i1> [[TMP8]] to <4 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL3_NOT]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shl <4 x i32> [[TMP12]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP14:%.*]] = ashr <4 x i32> [[TMP13]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i16>
+; CHECK-NEXT:    br i1 true, label [[BB3]], label [[BB2]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x i16> [ [[TMP5]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i16> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
+; CHECK-NEXT:    store i32 [[TMP18]], ptr [[B]], align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i16 [[TMP19]] to i32
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[TMP16]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
+; CHECK-NEXT:    store i32 [[TMP22]], ptr [[C]], align 16
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP16]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = sext i16 [[TMP23]] to i32
+; CHECK-NEXT:    store i32 [[TMP24]], ptr [[B]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br i1 %tobool3.not, label %bb1, label %bb2
+
+bb1:
+  %conv1.i.us = ashr i32 %0, 16
+  %cmp2.i.us = icmp slt i32 %conv1.i.us, %0
+  %sext26.us = zext i1 %cmp2.i.us to i32
+  %conv1.i.us.5 = ashr i32 %0, 16
+  %cmp2.i.us.5 = icmp slt i32 %conv1.i.us.5, %0
+  %sext26.us.5 = zext i1 %cmp2.i.us.5 to i32
+  %conv1.i.us.6 = ashr i32 %0, 16
+  %cmp2.i.us.6 = icmp slt i32 %conv1.i.us.6, %0
+  %sext26.us.6 = zext i1 %cmp2.i.us.6 to i32
+  %conv1.i.us.7 = ashr i32 %0, 16
+  %cmp2.i.us.7 = icmp slt i32 %conv1.i.us.7, %0
+  %sext26.us.7 = zext i1 %cmp2.i.us.7 to i32
+  br label %bb3
+
+bb2:
+  %cmp2.i = icmp sgt i32 %0, 0
+  %1 = zext i1 %cmp2.i to i32
+  %cond.i = select i1 %tobool3.not, i32 %0, i32 %1
+  %sext26 = shl i32 %cond.i, 16
+  %conv13 = ashr i32 %sext26, 16
+  %cmp2.i.5 = icmp sgt i32 %0, 0
+  %2 = zext i1 %cmp2.i.5 to i32
+  %cond.i.5 = select i1 %tobool3.not, i32 %0, i32 %2
+  %sext26.5 = shl i32 %cond.i.5, 16
+  %conv13.5 = ashr i32 %sext26.5, 16
+  %cmp2.i.6 = icmp sgt i32 %0, 0
+  %3 = zext i1 %cmp2.i.6 to i32
+  %cond.i.6 = select i1 %tobool3.not, i32 %0, i32 %3
+  %sext26.6 = shl i32 %cond.i.6, 16
+  %conv13.6 = ashr i32 %sext26.6, 16
+  %cmp2.i.7 = icmp sgt i32 %0, 0
+  %4 = zext i1 %cmp2.i.7 to i32
+  %cond.i.7 = select i1 %tobool3.not, i32 %0, i32 %4
+  %sext26.7 = shl i32 %cond.i.7, 16
+  %conv13.7 = ashr i32 %sext26.7, 16
+  br i1 true, label %bb3, label %bb2
+
+bb3:
+  %conv13p = phi i32 [ %sext26.us, %bb1 ], [ %conv13, %bb2 ]
+  %conv13.5p = phi i32 [ %sext26.us.5, %bb1 ], [ %conv13.5, %bb2 ]
+  %conv13.6p = phi i32 [ %sext26.us.6, %bb1 ], [ %conv13.6, %bb2 ]
+  %conv13.7p = phi i32 [ %sext26.us.7, %bb1 ], [ %conv13.7, %bb2 ]
+  store i32 %conv13p, ptr %b, align 16
+  store i32 %conv13.5p, ptr %a, align 8
+  store i32 %conv13.6p, ptr %c, align 16
+  store i32 %conv13.7p, ptr %b, align 8
+  ret i32 0
+}

>From 11b20d7ab09511d9e2bcd40606dfd3b31976efe0 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu at gmail.com>
Date: Wed, 27 Mar 2024 15:31:55 +0100
Subject: [PATCH 28/54] [clang] Fix an out-of-bound crash when checking
 template partial specializations. (#86794)

I found this (separate) issue while investigating #86757. The crash is
similar: it also occurs in substituteParameterMappings, but at a different
inner location.

This was an out-of-bounds issue where we accessed the front element of an
empty as-written template argument list to get the instantiation source range.
This patch fixes it by adding a proper guard.
---
 clang/docs/ReleaseNotes.rst          |  1 +
 clang/lib/Sema/SemaConcept.cpp       | 12 ++++++++++--
 clang/test/SemaTemplate/concepts.cpp | 10 +++++++++-
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 0dd026a5de5c6f..0fdd9e3fb3eee2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -454,6 +454,7 @@ Bug Fixes to C++ Support
 - Fix a crash when instantiating a lambda that captures ``this`` outside of its context. Fixes (#GH85343).
 - Fix an issue where a namespace alias could be defined using a qualified name (all name components
   following the first `::` were ignored).
+- Fix an out-of-bounds crash when checking the validity of template partial specializations. (part of #GH86757).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 1c546e9f5894f0..b6c4d3d540ef50 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -1269,10 +1269,18 @@ substituteParameterMappings(Sema &S, NormalizedConstraint &N,
                     : SourceLocation()));
     Atomic.ParameterMapping.emplace(TempArgs,  OccurringIndices.count());
   }
+  SourceLocation InstLocBegin =
+      ArgsAsWritten->arguments().empty()
+          ? ArgsAsWritten->getLAngleLoc()
+          : ArgsAsWritten->arguments().front().getSourceRange().getBegin();
+  SourceLocation InstLocEnd =
+      ArgsAsWritten->arguments().empty()
+          ? ArgsAsWritten->getRAngleLoc()
+          : ArgsAsWritten->arguments().front().getSourceRange().getEnd();
   Sema::InstantiatingTemplate Inst(
-      S, ArgsAsWritten->arguments().front().getSourceRange().getBegin(),
+      S, InstLocBegin,
       Sema::InstantiatingTemplate::ParameterMappingSubstitution{}, Concept,
-      ArgsAsWritten->arguments().front().getSourceRange());
+      {InstLocBegin, InstLocEnd});
   if (S.SubstTemplateArguments(*Atomic.ParameterMapping, MLTAL, SubstArgs))
     return true;
 
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index b7ea0d003a52d7..787cc809e25353 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++20 -verify %s
+// RUN: %clang_cc1 -std=c++20 -ferror-limit 0 -verify %s
 
 namespace PR47043 {
   template<typename T> concept True = true;
@@ -1114,3 +1114,11 @@ void foo() {
 }
 
 } // namespace GH64808
+
+namespace GH86757_1 {
+template <typename...> concept b = false;
+template <typename> concept c = b<>;
+template <typename d> concept f = c< d >;
+template <f> struct e; // expected-note {{}}
+template <f d> struct e<d>; // expected-error {{class template partial specialization is not more specialized than the primary template}}
+}

>From 9f84594e4ef87a50d1599814ba99fb735da76826 Mon Sep 17 00:00:00 2001
From: Zequan Wu <zequanwu at google.com>
Date: Wed, 27 Mar 2024 10:33:25 -0400
Subject: [PATCH 29/54] [lldb][Dwarf] Add missing timer when parsing
 .debug_abbrev. (#86568)

The time spent on parsing `.debug_abbrev` is also part of debug info
parsing time.
---
 lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 5f67658f86ea96..1164bc62682a9a 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -693,6 +693,7 @@ llvm::DWARFDebugAbbrev *SymbolFileDWARF::DebugAbbrev() {
   if (debug_abbrev_data.GetByteSize() == 0)
     return nullptr;
 
+  ElapsedTime elapsed(m_parse_time);
   auto abbr =
       std::make_unique<llvm::DWARFDebugAbbrev>(debug_abbrev_data.GetAsLLVM());
   llvm::Error error = abbr->parse();

>From 6d3ec56d3ce1478ac42a400a80532b8f732477fe Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 14:40:30 +0000
Subject: [PATCH 30/54] [X86] combineExtractWithShuffle - use
 combineExtractFromVectorLoad to extract scalar load from shuffled vector load

Improves #85419
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |   6 +
 llvm/test/CodeGen/X86/extractelement-load.ll | 118 ++-
 llvm/test/CodeGen/X86/masked_store.ll        | 714 +++++++------------
 llvm/test/CodeGen/X86/shrink_vmul.ll         | 223 +++---
 4 files changed, 397 insertions(+), 664 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4cd0bebe01bb48..a229f6e55a9880 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44234,6 +44234,12 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
     return DAG.getZExtOrTrunc(V, dl, VT);
 
+  if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT &&
+      SrcOp.getValueType() == SrcVT)
+    if (SDValue V =
+            combineExtractFromVectorLoad(N, SrcOp, ExtractIdx, dl, DAG, DCI))
+      return V;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index e3e1cdcd7f56ee..ba2217f704bd72 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -10,20 +10,13 @@ define i32 @t(ptr %val) nounwind  {
 ; X86-SSE2-LABEL: t:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
-; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    movl 8(%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X64-SSSE3-LABEL: t:
-; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
-; X64-SSSE3-NEXT:    movd %xmm0, %eax
-; X64-SSSE3-NEXT:    retq
-;
-; X64-AVX-LABEL: t:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    movl 8(%rdi), %eax
-; X64-AVX-NEXT:    retq
+; X64-LABEL: t:
+; X64:       # %bb.0:
+; X64-NEXT:    movl 8(%rdi), %eax
+; X64-NEXT:    retq
   %tmp2 = load <2 x i64>, ptr %val, align 16		; <<2 x i64>> [#uses=1]
   %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32>		; <<4 x i32>> [#uses=1]
   %tmp4 = extractelement <4 x i32> %tmp3, i32 2		; <i32> [#uses=1]
@@ -286,15 +279,14 @@ entry:
 define i32 @PR85419(ptr %p0) {
 ; X86-SSE2-LABEL: PR85419:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqa (%eax), %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    movd %xmm1, %ecx
-; X86-SSE2-NEXT:    xorl %edx, %edx
-; X86-SSE2-NEXT:    orl (%eax), %ecx
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT:    movd %xmm0, %eax
-; X86-SSE2-NEXT:    cmovel %edx, %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl (%ecx), %edx
+; X86-SSE2-NEXT:    xorl %eax, %eax
+; X86-SSE2-NEXT:    orl 4(%ecx), %edx
+; X86-SSE2-NEXT:    je .LBB8_2
+; X86-SSE2-NEXT:  # %bb.1:
+; X86-SSE2-NEXT:    movl 8(%ecx), %eax
+; X86-SSE2-NEXT:  .LBB8_2:
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-SSSE3-LABEL: PR85419:
@@ -443,35 +435,35 @@ define i32 @main() nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-32, %esp
 ; X86-SSE2-NEXT:    subl $64, %esp
-; X86-SSE2-NEXT:    movdqa zero, %xmm0
-; X86-SSE2-NEXT:    movaps n1+16, %xmm1
-; X86-SSE2-NEXT:    movaps n1, %xmm2
-; X86-SSE2-NEXT:    movaps %xmm2, zero
-; X86-SSE2-NEXT:    movaps %xmm1, zero+16
-; X86-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [2,2,2,2]
-; X86-SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movaps %xmm1, (%esp)
-; X86-SSE2-NEXT:    movdqa (%esp), %xmm1
-; X86-SSE2-NEXT:    movaps {{[0-9]+}}(%esp), %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT:    movd %xmm2, %eax
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT:    movd %xmm2, %ecx
+; X86-SSE2-NEXT:    movaps n1+16, %xmm0
+; X86-SSE2-NEXT:    movaps n1, %xmm1
+; X86-SSE2-NEXT:    movl zero+4, %ecx
+; X86-SSE2-NEXT:    movl zero+8, %eax
+; X86-SSE2-NEXT:    movaps %xmm1, zero
+; X86-SSE2-NEXT:    movaps %xmm0, zero+16
+; X86-SSE2-NEXT:    movaps {{.*#+}} xmm0 = [2,2,2,2]
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE2-NEXT:    movdqa (%esp), %xmm0
+; X86-SSE2-NEXT:    movaps {{[0-9]+}}(%esp), %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    movd %xmm1, %esi
 ; X86-SSE2-NEXT:    xorl %edx, %edx
-; X86-SSE2-NEXT:    divl %ecx
-; X86-SSE2-NEXT:    movl %eax, %ecx
+; X86-SSE2-NEXT:    divl %esi
+; X86-SSE2-NEXT:    movl %eax, %esi
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    movd %xmm0, %eax
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    movd %xmm0, %esi
+; X86-SSE2-NEXT:    movd %xmm0, %edi
+; X86-SSE2-NEXT:    movl %ecx, %eax
 ; X86-SSE2-NEXT:    xorl %edx, %edx
-; X86-SSE2-NEXT:    divl %esi
-; X86-SSE2-NEXT:    addl %ecx, %eax
-; X86-SSE2-NEXT:    leal -4(%ebp), %esp
+; X86-SSE2-NEXT:    divl %edi
+; X86-SSE2-NEXT:    addl %esi, %eax
+; X86-SSE2-NEXT:    leal -8(%ebp), %esp
 ; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
@@ -481,31 +473,29 @@ define i32 @main() nounwind {
 ; X64-SSSE3-NEXT:    movq %rsp, %rbp
 ; X64-SSSE3-NEXT:    andq $-32, %rsp
 ; X64-SSSE3-NEXT:    subq $64, %rsp
-; X64-SSSE3-NEXT:    movdqa zero(%rip), %xmm0
 ; X64-SSSE3-NEXT:    movq n1 at GOTPCREL(%rip), %rax
-; X64-SSSE3-NEXT:    movaps (%rax), %xmm1
-; X64-SSSE3-NEXT:    movaps 16(%rax), %xmm2
-; X64-SSSE3-NEXT:    movaps %xmm1, zero(%rip)
-; X64-SSSE3-NEXT:    movaps %xmm2, zero+16(%rip)
-; X64-SSSE3-NEXT:    movaps {{.*#+}} xmm1 = [2,2,2,2]
-; X64-SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    movaps %xmm1, (%rsp)
-; X64-SSSE3-NEXT:    movdqa (%rsp), %xmm1
-; X64-SSSE3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-SSSE3-NEXT:    movd %xmm2, %eax
-; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X64-SSSE3-NEXT:    movd %xmm2, %ecx
+; X64-SSSE3-NEXT:    movaps (%rax), %xmm0
+; X64-SSSE3-NEXT:    movaps 16(%rax), %xmm1
+; X64-SSSE3-NEXT:    movl zero+4(%rip), %ecx
+; X64-SSSE3-NEXT:    movl zero+8(%rip), %eax
+; X64-SSSE3-NEXT:    movaps %xmm0, zero(%rip)
+; X64-SSSE3-NEXT:    movaps %xmm1, zero+16(%rip)
+; X64-SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [2,2,2,2]
+; X64-SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSSE3-NEXT:    movaps %xmm0, (%rsp)
+; X64-SSSE3-NEXT:    movdqa (%rsp), %xmm0
+; X64-SSSE3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X64-SSSE3-NEXT:    movd %xmm1, %esi
 ; X64-SSSE3-NEXT:    xorl %edx, %edx
-; X64-SSSE3-NEXT:    divl %ecx
-; X64-SSSE3-NEXT:    movl %eax, %ecx
+; X64-SSSE3-NEXT:    divl %esi
+; X64-SSSE3-NEXT:    movl %eax, %esi
 ; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-SSSE3-NEXT:    movd %xmm0, %eax
-; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X64-SSSE3-NEXT:    movd %xmm0, %esi
+; X64-SSSE3-NEXT:    movd %xmm0, %edi
+; X64-SSSE3-NEXT:    movl %ecx, %eax
 ; X64-SSSE3-NEXT:    xorl %edx, %edx
-; X64-SSSE3-NEXT:    divl %esi
-; X64-SSSE3-NEXT:    addl %ecx, %eax
+; X64-SSSE3-NEXT:    divl %edi
+; X64-SSSE3-NEXT:    addl %esi, %eax
 ; X64-SSSE3-NEXT:    movq %rbp, %rsp
 ; X64-SSSE3-NEXT:    popq %rbp
 ; X64-SSSE3-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 03245ea31730e4..6aa0a81c90204d 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -5638,479 +5638,247 @@ define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask)
 }
 
 define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigger.ptr, ptr %val.ptr, ptr %dst) nounwind {
-; SSE2-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
-; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm6
-; SSE2-NEXT:    movdqa 32(%rdi), %xmm7
-; SSE2-NEXT:    movdqa 64(%rdi), %xmm8
-; SSE2-NEXT:    movl 80(%rsi), %eax
-; SSE2-NEXT:    movl 64(%rsi), %r8d
-; SSE2-NEXT:    movl 48(%rsi), %r9d
-; SSE2-NEXT:    movl 32(%rsi), %r10d
-; SSE2-NEXT:    movl 16(%rsi), %r11d
-; SSE2-NEXT:    movdqa 80(%rsi), %xmm0
-; SSE2-NEXT:    movdqa 64(%rsi), %xmm1
-; SSE2-NEXT:    movdqa 48(%rsi), %xmm2
-; SSE2-NEXT:    movdqa 32(%rsi), %xmm3
-; SSE2-NEXT:    movdqa 16(%rsi), %xmm4
-; SSE2-NEXT:    movdqa (%rsi), %xmm5
-; SSE2-NEXT:    packssdw 48(%rdi), %xmm7
-; SSE2-NEXT:    packssdw 16(%rdi), %xmm6
-; SSE2-NEXT:    packsswb %xmm7, %xmm6
-; SSE2-NEXT:    packssdw 80(%rdi), %xmm8
-; SSE2-NEXT:    packsswb %xmm8, %xmm8
-; SSE2-NEXT:    pmovmskb %xmm6, %edi
-; SSE2-NEXT:    andl $21845, %edi ## imm = 0x5555
-; SSE2-NEXT:    pmovmskb %xmm8, %ecx
-; SSE2-NEXT:    andl $85, %ecx
-; SSE2-NEXT:    shll $16, %ecx
-; SSE2-NEXT:    orl %edi, %ecx
-; SSE2-NEXT:    testb $1, %cl
-; SSE2-NEXT:    jne LBB31_1
-; SSE2-NEXT:  ## %bb.2: ## %else
-; SSE2-NEXT:    testb $2, %cl
-; SSE2-NEXT:    jne LBB31_3
-; SSE2-NEXT:  LBB31_4: ## %else2
-; SSE2-NEXT:    testb $4, %cl
-; SSE2-NEXT:    jne LBB31_5
-; SSE2-NEXT:  LBB31_6: ## %else4
-; SSE2-NEXT:    testb $8, %cl
-; SSE2-NEXT:    jne LBB31_7
-; SSE2-NEXT:  LBB31_8: ## %else6
-; SSE2-NEXT:    testb $16, %cl
-; SSE2-NEXT:    jne LBB31_9
-; SSE2-NEXT:  LBB31_10: ## %else8
-; SSE2-NEXT:    testb $32, %cl
-; SSE2-NEXT:    jne LBB31_11
-; SSE2-NEXT:  LBB31_12: ## %else10
-; SSE2-NEXT:    testb $64, %cl
-; SSE2-NEXT:    jne LBB31_13
-; SSE2-NEXT:  LBB31_14: ## %else12
-; SSE2-NEXT:    testb %cl, %cl
-; SSE2-NEXT:    js LBB31_15
-; SSE2-NEXT:  LBB31_16: ## %else14
-; SSE2-NEXT:    testl $256, %ecx ## imm = 0x100
-; SSE2-NEXT:    jne LBB31_17
-; SSE2-NEXT:  LBB31_18: ## %else16
-; SSE2-NEXT:    testl $512, %ecx ## imm = 0x200
-; SSE2-NEXT:    jne LBB31_19
-; SSE2-NEXT:  LBB31_20: ## %else18
-; SSE2-NEXT:    testl $1024, %ecx ## imm = 0x400
-; SSE2-NEXT:    jne LBB31_21
-; SSE2-NEXT:  LBB31_22: ## %else20
-; SSE2-NEXT:    testl $2048, %ecx ## imm = 0x800
-; SSE2-NEXT:    jne LBB31_23
-; SSE2-NEXT:  LBB31_24: ## %else22
-; SSE2-NEXT:    testl $4096, %ecx ## imm = 0x1000
-; SSE2-NEXT:    jne LBB31_25
-; SSE2-NEXT:  LBB31_26: ## %else24
-; SSE2-NEXT:    testl $8192, %ecx ## imm = 0x2000
-; SSE2-NEXT:    jne LBB31_27
-; SSE2-NEXT:  LBB31_28: ## %else26
-; SSE2-NEXT:    testl $16384, %ecx ## imm = 0x4000
-; SSE2-NEXT:    jne LBB31_29
-; SSE2-NEXT:  LBB31_30: ## %else28
-; SSE2-NEXT:    testw %cx, %cx
-; SSE2-NEXT:    js LBB31_31
-; SSE2-NEXT:  LBB31_32: ## %else30
-; SSE2-NEXT:    testl $65536, %ecx ## imm = 0x10000
-; SSE2-NEXT:    jne LBB31_33
-; SSE2-NEXT:  LBB31_34: ## %else32
-; SSE2-NEXT:    testl $131072, %ecx ## imm = 0x20000
-; SSE2-NEXT:    jne LBB31_35
-; SSE2-NEXT:  LBB31_36: ## %else34
-; SSE2-NEXT:    testl $262144, %ecx ## imm = 0x40000
-; SSE2-NEXT:    jne LBB31_37
-; SSE2-NEXT:  LBB31_38: ## %else36
-; SSE2-NEXT:    testl $524288, %ecx ## imm = 0x80000
-; SSE2-NEXT:    jne LBB31_39
-; SSE2-NEXT:  LBB31_40: ## %else38
-; SSE2-NEXT:    testl $1048576, %ecx ## imm = 0x100000
-; SSE2-NEXT:    jne LBB31_41
-; SSE2-NEXT:  LBB31_42: ## %else40
-; SSE2-NEXT:    testl $2097152, %ecx ## imm = 0x200000
-; SSE2-NEXT:    jne LBB31_43
-; SSE2-NEXT:  LBB31_44: ## %else42
-; SSE2-NEXT:    testl $4194304, %ecx ## imm = 0x400000
-; SSE2-NEXT:    je LBB31_46
-; SSE2-NEXT:  LBB31_45: ## %cond.store43
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    movl %eax, 88(%rdx)
-; SSE2-NEXT:  LBB31_46: ## %else44
-; SSE2-NEXT:    movb $1, %al
-; SSE2-NEXT:    testb %al, %al
-; SSE2-NEXT:    jne LBB31_48
-; SSE2-NEXT:  ## %bb.47: ## %cond.store45
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movl %eax, 92(%rdx)
-; SSE2-NEXT:  LBB31_48: ## %else46
-; SSE2-NEXT:    retq
-; SSE2-NEXT:  LBB31_1: ## %cond.store
-; SSE2-NEXT:    movl (%rsi), %esi
-; SSE2-NEXT:    movl %esi, (%rdx)
-; SSE2-NEXT:    testb $2, %cl
-; SSE2-NEXT:    je LBB31_4
-; SSE2-NEXT:  LBB31_3: ## %cond.store1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
-; SSE2-NEXT:    movd %xmm6, %esi
-; SSE2-NEXT:    movl %esi, 4(%rdx)
-; SSE2-NEXT:    testb $4, %cl
-; SSE2-NEXT:    je LBB31_6
-; SSE2-NEXT:  LBB31_5: ## %cond.store3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; SSE2-NEXT:    movd %xmm6, %esi
-; SSE2-NEXT:    movl %esi, 8(%rdx)
-; SSE2-NEXT:    testb $8, %cl
-; SSE2-NEXT:    je LBB31_8
-; SSE2-NEXT:  LBB31_7: ## %cond.store5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; SSE2-NEXT:    movd %xmm5, %esi
-; SSE2-NEXT:    movl %esi, 12(%rdx)
-; SSE2-NEXT:    testb $16, %cl
-; SSE2-NEXT:    je LBB31_10
-; SSE2-NEXT:  LBB31_9: ## %cond.store7
-; SSE2-NEXT:    movl %r11d, 16(%rdx)
-; SSE2-NEXT:    testb $32, %cl
-; SSE2-NEXT:    je LBB31_12
-; SSE2-NEXT:  LBB31_11: ## %cond.store9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1]
-; SSE2-NEXT:    movd %xmm5, %esi
-; SSE2-NEXT:    movl %esi, 20(%rdx)
-; SSE2-NEXT:    testb $64, %cl
-; SSE2-NEXT:    je LBB31_14
-; SSE2-NEXT:  LBB31_13: ## %cond.store11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE2-NEXT:    movd %xmm5, %esi
-; SSE2-NEXT:    movl %esi, 24(%rdx)
-; SSE2-NEXT:    testb %cl, %cl
-; SSE2-NEXT:    jns LBB31_16
-; SSE2-NEXT:  LBB31_15: ## %cond.store13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; SSE2-NEXT:    movd %xmm4, %esi
-; SSE2-NEXT:    movl %esi, 28(%rdx)
-; SSE2-NEXT:    testl $256, %ecx ## imm = 0x100
-; SSE2-NEXT:    je LBB31_18
-; SSE2-NEXT:  LBB31_17: ## %cond.store15
-; SSE2-NEXT:    movl %r10d, 32(%rdx)
-; SSE2-NEXT:    testl $512, %ecx ## imm = 0x200
-; SSE2-NEXT:    je LBB31_20
-; SSE2-NEXT:  LBB31_19: ## %cond.store17
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; SSE2-NEXT:    movd %xmm4, %esi
-; SSE2-NEXT:    movl %esi, 36(%rdx)
-; SSE2-NEXT:    testl $1024, %ecx ## imm = 0x400
-; SSE2-NEXT:    je LBB31_22
-; SSE2-NEXT:  LBB31_21: ## %cond.store19
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; SSE2-NEXT:    movd %xmm4, %esi
-; SSE2-NEXT:    movl %esi, 40(%rdx)
-; SSE2-NEXT:    testl $2048, %ecx ## imm = 0x800
-; SSE2-NEXT:    je LBB31_24
-; SSE2-NEXT:  LBB31_23: ## %cond.store21
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSE2-NEXT:    movd %xmm3, %esi
-; SSE2-NEXT:    movl %esi, 44(%rdx)
-; SSE2-NEXT:    testl $4096, %ecx ## imm = 0x1000
-; SSE2-NEXT:    je LBB31_26
-; SSE2-NEXT:  LBB31_25: ## %cond.store23
-; SSE2-NEXT:    movl %r9d, 48(%rdx)
-; SSE2-NEXT:    testl $8192, %ecx ## imm = 0x2000
-; SSE2-NEXT:    je LBB31_28
-; SSE2-NEXT:  LBB31_27: ## %cond.store25
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE2-NEXT:    movd %xmm3, %esi
-; SSE2-NEXT:    movl %esi, 52(%rdx)
-; SSE2-NEXT:    testl $16384, %ecx ## imm = 0x4000
-; SSE2-NEXT:    je LBB31_30
-; SSE2-NEXT:  LBB31_29: ## %cond.store27
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE2-NEXT:    movd %xmm3, %esi
-; SSE2-NEXT:    movl %esi, 56(%rdx)
-; SSE2-NEXT:    testw %cx, %cx
-; SSE2-NEXT:    jns LBB31_32
-; SSE2-NEXT:  LBB31_31: ## %cond.store29
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE2-NEXT:    movd %xmm2, %esi
-; SSE2-NEXT:    movl %esi, 60(%rdx)
-; SSE2-NEXT:    testl $65536, %ecx ## imm = 0x10000
-; SSE2-NEXT:    je LBB31_34
-; SSE2-NEXT:  LBB31_33: ## %cond.store31
-; SSE2-NEXT:    movl %r8d, 64(%rdx)
-; SSE2-NEXT:    testl $131072, %ecx ## imm = 0x20000
-; SSE2-NEXT:    je LBB31_36
-; SSE2-NEXT:  LBB31_35: ## %cond.store33
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; SSE2-NEXT:    movd %xmm2, %esi
-; SSE2-NEXT:    movl %esi, 68(%rdx)
-; SSE2-NEXT:    testl $262144, %ecx ## imm = 0x40000
-; SSE2-NEXT:    je LBB31_38
-; SSE2-NEXT:  LBB31_37: ## %cond.store35
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movd %xmm2, %esi
-; SSE2-NEXT:    movl %esi, 72(%rdx)
-; SSE2-NEXT:    testl $524288, %ecx ## imm = 0x80000
-; SSE2-NEXT:    je LBB31_40
-; SSE2-NEXT:  LBB31_39: ## %cond.store37
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE2-NEXT:    movd %xmm1, %esi
-; SSE2-NEXT:    movl %esi, 76(%rdx)
-; SSE2-NEXT:    testl $1048576, %ecx ## imm = 0x100000
-; SSE2-NEXT:    je LBB31_42
-; SSE2-NEXT:  LBB31_41: ## %cond.store39
-; SSE2-NEXT:    movl %eax, 80(%rdx)
-; SSE2-NEXT:    testl $2097152, %ecx ## imm = 0x200000
-; SSE2-NEXT:    je LBB31_44
-; SSE2-NEXT:  LBB31_43: ## %cond.store41
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    movl %eax, 84(%rdx)
-; SSE2-NEXT:    testl $4194304, %ecx ## imm = 0x400000
-; SSE2-NEXT:    jne LBB31_45
-; SSE2-NEXT:    jmp LBB31_46
-;
-; SSE4-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
-; SSE4:       ## %bb.0:
-; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    movdqa (%rdi), %xmm1
-; SSE4-NEXT:    movdqa 32(%rdi), %xmm2
-; SSE4-NEXT:    movdqa 64(%rdi), %xmm0
-; SSE4-NEXT:    movl 92(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 88(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 84(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 80(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 76(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 72(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 68(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 64(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 60(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 56(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    movl 52(%rsi), %eax
-; SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SSE4-NEXT:    packssdw 48(%rdi), %xmm2
-; SSE4-NEXT:    packssdw 16(%rdi), %xmm1
-; SSE4-NEXT:    packsswb %xmm2, %xmm1
-; SSE4-NEXT:    packssdw 80(%rdi), %xmm0
-; SSE4-NEXT:    packsswb %xmm0, %xmm0
-; SSE4-NEXT:    pmovmskb %xmm1, %eax
-; SSE4-NEXT:    andl $21845, %eax ## imm = 0x5555
-; SSE4-NEXT:    pmovmskb %xmm0, %edi
-; SSE4-NEXT:    andl $85, %edi
-; SSE4-NEXT:    shll $16, %edi
-; SSE4-NEXT:    orl %eax, %edi
-; SSE4-NEXT:    movl 48(%rsi), %r13d
-; SSE4-NEXT:    testb $1, %dil
-; SSE4-NEXT:    movl 44(%rsi), %eax
-; SSE4-NEXT:    movl 40(%rsi), %ecx
-; SSE4-NEXT:    movl 36(%rsi), %r8d
-; SSE4-NEXT:    movl 32(%rsi), %r9d
-; SSE4-NEXT:    movl 28(%rsi), %r10d
-; SSE4-NEXT:    movl 24(%rsi), %r11d
-; SSE4-NEXT:    movl 20(%rsi), %ebx
-; SSE4-NEXT:    movl 16(%rsi), %ebp
-; SSE4-NEXT:    movl 12(%rsi), %r14d
-; SSE4-NEXT:    movl 8(%rsi), %r15d
-; SSE4-NEXT:    movl 4(%rsi), %r12d
-; SSE4-NEXT:    jne LBB31_1
-; SSE4-NEXT:  ## %bb.2: ## %else
-; SSE4-NEXT:    testb $2, %dil
-; SSE4-NEXT:    jne LBB31_3
-; SSE4-NEXT:  LBB31_4: ## %else2
-; SSE4-NEXT:    testb $4, %dil
-; SSE4-NEXT:    jne LBB31_5
-; SSE4-NEXT:  LBB31_6: ## %else4
-; SSE4-NEXT:    testb $8, %dil
-; SSE4-NEXT:    jne LBB31_7
-; SSE4-NEXT:  LBB31_8: ## %else6
-; SSE4-NEXT:    testb $16, %dil
-; SSE4-NEXT:    jne LBB31_9
-; SSE4-NEXT:  LBB31_10: ## %else8
-; SSE4-NEXT:    testb $32, %dil
-; SSE4-NEXT:    jne LBB31_11
-; SSE4-NEXT:  LBB31_12: ## %else10
-; SSE4-NEXT:    testb $64, %dil
-; SSE4-NEXT:    jne LBB31_13
-; SSE4-NEXT:  LBB31_14: ## %else12
-; SSE4-NEXT:    testb %dil, %dil
-; SSE4-NEXT:    js LBB31_15
-; SSE4-NEXT:  LBB31_16: ## %else14
-; SSE4-NEXT:    testl $256, %edi ## imm = 0x100
-; SSE4-NEXT:    jne LBB31_17
-; SSE4-NEXT:  LBB31_18: ## %else16
-; SSE4-NEXT:    testl $512, %edi ## imm = 0x200
-; SSE4-NEXT:    jne LBB31_19
-; SSE4-NEXT:  LBB31_20: ## %else18
-; SSE4-NEXT:    testl $1024, %edi ## imm = 0x400
-; SSE4-NEXT:    jne LBB31_21
-; SSE4-NEXT:  LBB31_22: ## %else20
-; SSE4-NEXT:    testl $2048, %edi ## imm = 0x800
-; SSE4-NEXT:    jne LBB31_23
-; SSE4-NEXT:  LBB31_24: ## %else22
-; SSE4-NEXT:    testl $4096, %edi ## imm = 0x1000
-; SSE4-NEXT:    jne LBB31_25
-; SSE4-NEXT:  LBB31_26: ## %else24
-; SSE4-NEXT:    testl $8192, %edi ## imm = 0x2000
-; SSE4-NEXT:    jne LBB31_27
-; SSE4-NEXT:  LBB31_28: ## %else26
-; SSE4-NEXT:    testl $16384, %edi ## imm = 0x4000
-; SSE4-NEXT:    jne LBB31_29
-; SSE4-NEXT:  LBB31_30: ## %else28
-; SSE4-NEXT:    testw %di, %di
-; SSE4-NEXT:    js LBB31_31
-; SSE4-NEXT:  LBB31_32: ## %else30
-; SSE4-NEXT:    testl $65536, %edi ## imm = 0x10000
-; SSE4-NEXT:    jne LBB31_33
-; SSE4-NEXT:  LBB31_34: ## %else32
-; SSE4-NEXT:    testl $131072, %edi ## imm = 0x20000
-; SSE4-NEXT:    jne LBB31_35
-; SSE4-NEXT:  LBB31_36: ## %else34
-; SSE4-NEXT:    testl $262144, %edi ## imm = 0x40000
-; SSE4-NEXT:    jne LBB31_37
-; SSE4-NEXT:  LBB31_38: ## %else36
-; SSE4-NEXT:    testl $524288, %edi ## imm = 0x80000
-; SSE4-NEXT:    jne LBB31_39
-; SSE4-NEXT:  LBB31_40: ## %else38
-; SSE4-NEXT:    testl $1048576, %edi ## imm = 0x100000
-; SSE4-NEXT:    jne LBB31_41
-; SSE4-NEXT:  LBB31_42: ## %else40
-; SSE4-NEXT:    testl $2097152, %edi ## imm = 0x200000
-; SSE4-NEXT:    jne LBB31_43
-; SSE4-NEXT:  LBB31_44: ## %else42
-; SSE4-NEXT:    testl $4194304, %edi ## imm = 0x400000
-; SSE4-NEXT:    je LBB31_46
-; SSE4-NEXT:  LBB31_45: ## %cond.store43
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 88(%rdx)
-; SSE4-NEXT:  LBB31_46: ## %else44
-; SSE4-NEXT:    movb $1, %al
-; SSE4-NEXT:    testb %al, %al
-; SSE4-NEXT:    jne LBB31_48
-; SSE4-NEXT:  ## %bb.47: ## %cond.store45
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 92(%rdx)
-; SSE4-NEXT:  LBB31_48: ## %else46
-; SSE4-NEXT:    popq %rbx
-; SSE4-NEXT:    popq %r12
-; SSE4-NEXT:    popq %r13
-; SSE4-NEXT:    popq %r14
-; SSE4-NEXT:    popq %r15
-; SSE4-NEXT:    popq %rbp
-; SSE4-NEXT:    retq
-; SSE4-NEXT:  LBB31_1: ## %cond.store
-; SSE4-NEXT:    movl (%rsi), %esi
-; SSE4-NEXT:    movl %esi, (%rdx)
-; SSE4-NEXT:    testb $2, %dil
-; SSE4-NEXT:    je LBB31_4
-; SSE4-NEXT:  LBB31_3: ## %cond.store1
-; SSE4-NEXT:    movl %r12d, 4(%rdx)
-; SSE4-NEXT:    testb $4, %dil
-; SSE4-NEXT:    je LBB31_6
-; SSE4-NEXT:  LBB31_5: ## %cond.store3
-; SSE4-NEXT:    movl %r15d, 8(%rdx)
-; SSE4-NEXT:    testb $8, %dil
-; SSE4-NEXT:    je LBB31_8
-; SSE4-NEXT:  LBB31_7: ## %cond.store5
-; SSE4-NEXT:    movl %r14d, 12(%rdx)
-; SSE4-NEXT:    testb $16, %dil
-; SSE4-NEXT:    je LBB31_10
-; SSE4-NEXT:  LBB31_9: ## %cond.store7
-; SSE4-NEXT:    movl %ebp, 16(%rdx)
-; SSE4-NEXT:    testb $32, %dil
-; SSE4-NEXT:    je LBB31_12
-; SSE4-NEXT:  LBB31_11: ## %cond.store9
-; SSE4-NEXT:    movl %ebx, 20(%rdx)
-; SSE4-NEXT:    testb $64, %dil
-; SSE4-NEXT:    je LBB31_14
-; SSE4-NEXT:  LBB31_13: ## %cond.store11
-; SSE4-NEXT:    movl %r11d, 24(%rdx)
-; SSE4-NEXT:    testb %dil, %dil
-; SSE4-NEXT:    jns LBB31_16
-; SSE4-NEXT:  LBB31_15: ## %cond.store13
-; SSE4-NEXT:    movl %r10d, 28(%rdx)
-; SSE4-NEXT:    testl $256, %edi ## imm = 0x100
-; SSE4-NEXT:    je LBB31_18
-; SSE4-NEXT:  LBB31_17: ## %cond.store15
-; SSE4-NEXT:    movl %r9d, 32(%rdx)
-; SSE4-NEXT:    testl $512, %edi ## imm = 0x200
-; SSE4-NEXT:    je LBB31_20
-; SSE4-NEXT:  LBB31_19: ## %cond.store17
-; SSE4-NEXT:    movl %r8d, 36(%rdx)
-; SSE4-NEXT:    testl $1024, %edi ## imm = 0x400
-; SSE4-NEXT:    je LBB31_22
-; SSE4-NEXT:  LBB31_21: ## %cond.store19
-; SSE4-NEXT:    movl %ecx, 40(%rdx)
-; SSE4-NEXT:    testl $2048, %edi ## imm = 0x800
-; SSE4-NEXT:    je LBB31_24
-; SSE4-NEXT:  LBB31_23: ## %cond.store21
-; SSE4-NEXT:    movl %eax, 44(%rdx)
-; SSE4-NEXT:    testl $4096, %edi ## imm = 0x1000
-; SSE4-NEXT:    je LBB31_26
-; SSE4-NEXT:  LBB31_25: ## %cond.store23
-; SSE4-NEXT:    movl %r13d, 48(%rdx)
-; SSE4-NEXT:    testl $8192, %edi ## imm = 0x2000
-; SSE4-NEXT:    je LBB31_28
-; SSE4-NEXT:  LBB31_27: ## %cond.store25
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 52(%rdx)
-; SSE4-NEXT:    testl $16384, %edi ## imm = 0x4000
-; SSE4-NEXT:    je LBB31_30
-; SSE4-NEXT:  LBB31_29: ## %cond.store27
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 56(%rdx)
-; SSE4-NEXT:    testw %di, %di
-; SSE4-NEXT:    jns LBB31_32
-; SSE4-NEXT:  LBB31_31: ## %cond.store29
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 60(%rdx)
-; SSE4-NEXT:    testl $65536, %edi ## imm = 0x10000
-; SSE4-NEXT:    je LBB31_34
-; SSE4-NEXT:  LBB31_33: ## %cond.store31
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 64(%rdx)
-; SSE4-NEXT:    testl $131072, %edi ## imm = 0x20000
-; SSE4-NEXT:    je LBB31_36
-; SSE4-NEXT:  LBB31_35: ## %cond.store33
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 68(%rdx)
-; SSE4-NEXT:    testl $262144, %edi ## imm = 0x40000
-; SSE4-NEXT:    je LBB31_38
-; SSE4-NEXT:  LBB31_37: ## %cond.store35
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 72(%rdx)
-; SSE4-NEXT:    testl $524288, %edi ## imm = 0x80000
-; SSE4-NEXT:    je LBB31_40
-; SSE4-NEXT:  LBB31_39: ## %cond.store37
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 76(%rdx)
-; SSE4-NEXT:    testl $1048576, %edi ## imm = 0x100000
-; SSE4-NEXT:    je LBB31_42
-; SSE4-NEXT:  LBB31_41: ## %cond.store39
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 80(%rdx)
-; SSE4-NEXT:    testl $2097152, %edi ## imm = 0x200000
-; SSE4-NEXT:    je LBB31_44
-; SSE4-NEXT:  LBB31_43: ## %cond.store41
-; SSE4-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
-; SSE4-NEXT:    movl %eax, 84(%rdx)
-; SSE4-NEXT:    testl $4194304, %edi ## imm = 0x400000
-; SSE4-NEXT:    jne LBB31_45
-; SSE4-NEXT:    jmp LBB31_46
+; SSE-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    pushq %r15
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %r13
+; SSE-NEXT:    pushq %r12
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    movdqa 32(%rdi), %xmm2
+; SSE-NEXT:    movdqa 64(%rdi), %xmm0
+; SSE-NEXT:    movl 92(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 88(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 84(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 80(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 76(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 72(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 68(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 64(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 60(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 56(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    movl 52(%rsi), %eax
+; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; SSE-NEXT:    packssdw 48(%rdi), %xmm2
+; SSE-NEXT:    packssdw 16(%rdi), %xmm1
+; SSE-NEXT:    packsswb %xmm2, %xmm1
+; SSE-NEXT:    packssdw 80(%rdi), %xmm0
+; SSE-NEXT:    packsswb %xmm0, %xmm0
+; SSE-NEXT:    pmovmskb %xmm1, %eax
+; SSE-NEXT:    andl $21845, %eax ## imm = 0x5555
+; SSE-NEXT:    pmovmskb %xmm0, %edi
+; SSE-NEXT:    andl $85, %edi
+; SSE-NEXT:    shll $16, %edi
+; SSE-NEXT:    orl %eax, %edi
+; SSE-NEXT:    movl 48(%rsi), %r13d
+; SSE-NEXT:    testb $1, %dil
+; SSE-NEXT:    movl 44(%rsi), %eax
+; SSE-NEXT:    movl 40(%rsi), %ecx
+; SSE-NEXT:    movl 36(%rsi), %r8d
+; SSE-NEXT:    movl 32(%rsi), %r9d
+; SSE-NEXT:    movl 28(%rsi), %r10d
+; SSE-NEXT:    movl 24(%rsi), %r11d
+; SSE-NEXT:    movl 20(%rsi), %ebx
+; SSE-NEXT:    movl 16(%rsi), %ebp
+; SSE-NEXT:    movl 12(%rsi), %r14d
+; SSE-NEXT:    movl 8(%rsi), %r15d
+; SSE-NEXT:    movl 4(%rsi), %r12d
+; SSE-NEXT:    jne LBB31_1
+; SSE-NEXT:  ## %bb.2: ## %else
+; SSE-NEXT:    testb $2, %dil
+; SSE-NEXT:    jne LBB31_3
+; SSE-NEXT:  LBB31_4: ## %else2
+; SSE-NEXT:    testb $4, %dil
+; SSE-NEXT:    jne LBB31_5
+; SSE-NEXT:  LBB31_6: ## %else4
+; SSE-NEXT:    testb $8, %dil
+; SSE-NEXT:    jne LBB31_7
+; SSE-NEXT:  LBB31_8: ## %else6
+; SSE-NEXT:    testb $16, %dil
+; SSE-NEXT:    jne LBB31_9
+; SSE-NEXT:  LBB31_10: ## %else8
+; SSE-NEXT:    testb $32, %dil
+; SSE-NEXT:    jne LBB31_11
+; SSE-NEXT:  LBB31_12: ## %else10
+; SSE-NEXT:    testb $64, %dil
+; SSE-NEXT:    jne LBB31_13
+; SSE-NEXT:  LBB31_14: ## %else12
+; SSE-NEXT:    testb %dil, %dil
+; SSE-NEXT:    js LBB31_15
+; SSE-NEXT:  LBB31_16: ## %else14
+; SSE-NEXT:    testl $256, %edi ## imm = 0x100
+; SSE-NEXT:    jne LBB31_17
+; SSE-NEXT:  LBB31_18: ## %else16
+; SSE-NEXT:    testl $512, %edi ## imm = 0x200
+; SSE-NEXT:    jne LBB31_19
+; SSE-NEXT:  LBB31_20: ## %else18
+; SSE-NEXT:    testl $1024, %edi ## imm = 0x400
+; SSE-NEXT:    jne LBB31_21
+; SSE-NEXT:  LBB31_22: ## %else20
+; SSE-NEXT:    testl $2048, %edi ## imm = 0x800
+; SSE-NEXT:    jne LBB31_23
+; SSE-NEXT:  LBB31_24: ## %else22
+; SSE-NEXT:    testl $4096, %edi ## imm = 0x1000
+; SSE-NEXT:    jne LBB31_25
+; SSE-NEXT:  LBB31_26: ## %else24
+; SSE-NEXT:    testl $8192, %edi ## imm = 0x2000
+; SSE-NEXT:    jne LBB31_27
+; SSE-NEXT:  LBB31_28: ## %else26
+; SSE-NEXT:    testl $16384, %edi ## imm = 0x4000
+; SSE-NEXT:    jne LBB31_29
+; SSE-NEXT:  LBB31_30: ## %else28
+; SSE-NEXT:    testw %di, %di
+; SSE-NEXT:    js LBB31_31
+; SSE-NEXT:  LBB31_32: ## %else30
+; SSE-NEXT:    testl $65536, %edi ## imm = 0x10000
+; SSE-NEXT:    jne LBB31_33
+; SSE-NEXT:  LBB31_34: ## %else32
+; SSE-NEXT:    testl $131072, %edi ## imm = 0x20000
+; SSE-NEXT:    jne LBB31_35
+; SSE-NEXT:  LBB31_36: ## %else34
+; SSE-NEXT:    testl $262144, %edi ## imm = 0x40000
+; SSE-NEXT:    jne LBB31_37
+; SSE-NEXT:  LBB31_38: ## %else36
+; SSE-NEXT:    testl $524288, %edi ## imm = 0x80000
+; SSE-NEXT:    jne LBB31_39
+; SSE-NEXT:  LBB31_40: ## %else38
+; SSE-NEXT:    testl $1048576, %edi ## imm = 0x100000
+; SSE-NEXT:    jne LBB31_41
+; SSE-NEXT:  LBB31_42: ## %else40
+; SSE-NEXT:    testl $2097152, %edi ## imm = 0x200000
+; SSE-NEXT:    jne LBB31_43
+; SSE-NEXT:  LBB31_44: ## %else42
+; SSE-NEXT:    testl $4194304, %edi ## imm = 0x400000
+; SSE-NEXT:    je LBB31_46
+; SSE-NEXT:  LBB31_45: ## %cond.store43
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 88(%rdx)
+; SSE-NEXT:  LBB31_46: ## %else44
+; SSE-NEXT:    movb $1, %al
+; SSE-NEXT:    testb %al, %al
+; SSE-NEXT:    jne LBB31_48
+; SSE-NEXT:  ## %bb.47: ## %cond.store45
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 92(%rdx)
+; SSE-NEXT:  LBB31_48: ## %else46
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r12
+; SSE-NEXT:    popq %r13
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %r15
+; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    retq
+; SSE-NEXT:  LBB31_1: ## %cond.store
+; SSE-NEXT:    movl (%rsi), %esi
+; SSE-NEXT:    movl %esi, (%rdx)
+; SSE-NEXT:    testb $2, %dil
+; SSE-NEXT:    je LBB31_4
+; SSE-NEXT:  LBB31_3: ## %cond.store1
+; SSE-NEXT:    movl %r12d, 4(%rdx)
+; SSE-NEXT:    testb $4, %dil
+; SSE-NEXT:    je LBB31_6
+; SSE-NEXT:  LBB31_5: ## %cond.store3
+; SSE-NEXT:    movl %r15d, 8(%rdx)
+; SSE-NEXT:    testb $8, %dil
+; SSE-NEXT:    je LBB31_8
+; SSE-NEXT:  LBB31_7: ## %cond.store5
+; SSE-NEXT:    movl %r14d, 12(%rdx)
+; SSE-NEXT:    testb $16, %dil
+; SSE-NEXT:    je LBB31_10
+; SSE-NEXT:  LBB31_9: ## %cond.store7
+; SSE-NEXT:    movl %ebp, 16(%rdx)
+; SSE-NEXT:    testb $32, %dil
+; SSE-NEXT:    je LBB31_12
+; SSE-NEXT:  LBB31_11: ## %cond.store9
+; SSE-NEXT:    movl %ebx, 20(%rdx)
+; SSE-NEXT:    testb $64, %dil
+; SSE-NEXT:    je LBB31_14
+; SSE-NEXT:  LBB31_13: ## %cond.store11
+; SSE-NEXT:    movl %r11d, 24(%rdx)
+; SSE-NEXT:    testb %dil, %dil
+; SSE-NEXT:    jns LBB31_16
+; SSE-NEXT:  LBB31_15: ## %cond.store13
+; SSE-NEXT:    movl %r10d, 28(%rdx)
+; SSE-NEXT:    testl $256, %edi ## imm = 0x100
+; SSE-NEXT:    je LBB31_18
+; SSE-NEXT:  LBB31_17: ## %cond.store15
+; SSE-NEXT:    movl %r9d, 32(%rdx)
+; SSE-NEXT:    testl $512, %edi ## imm = 0x200
+; SSE-NEXT:    je LBB31_20
+; SSE-NEXT:  LBB31_19: ## %cond.store17
+; SSE-NEXT:    movl %r8d, 36(%rdx)
+; SSE-NEXT:    testl $1024, %edi ## imm = 0x400
+; SSE-NEXT:    je LBB31_22
+; SSE-NEXT:  LBB31_21: ## %cond.store19
+; SSE-NEXT:    movl %ecx, 40(%rdx)
+; SSE-NEXT:    testl $2048, %edi ## imm = 0x800
+; SSE-NEXT:    je LBB31_24
+; SSE-NEXT:  LBB31_23: ## %cond.store21
+; SSE-NEXT:    movl %eax, 44(%rdx)
+; SSE-NEXT:    testl $4096, %edi ## imm = 0x1000
+; SSE-NEXT:    je LBB31_26
+; SSE-NEXT:  LBB31_25: ## %cond.store23
+; SSE-NEXT:    movl %r13d, 48(%rdx)
+; SSE-NEXT:    testl $8192, %edi ## imm = 0x2000
+; SSE-NEXT:    je LBB31_28
+; SSE-NEXT:  LBB31_27: ## %cond.store25
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 52(%rdx)
+; SSE-NEXT:    testl $16384, %edi ## imm = 0x4000
+; SSE-NEXT:    je LBB31_30
+; SSE-NEXT:  LBB31_29: ## %cond.store27
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 56(%rdx)
+; SSE-NEXT:    testw %di, %di
+; SSE-NEXT:    jns LBB31_32
+; SSE-NEXT:  LBB31_31: ## %cond.store29
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 60(%rdx)
+; SSE-NEXT:    testl $65536, %edi ## imm = 0x10000
+; SSE-NEXT:    je LBB31_34
+; SSE-NEXT:  LBB31_33: ## %cond.store31
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 64(%rdx)
+; SSE-NEXT:    testl $131072, %edi ## imm = 0x20000
+; SSE-NEXT:    je LBB31_36
+; SSE-NEXT:  LBB31_35: ## %cond.store33
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 68(%rdx)
+; SSE-NEXT:    testl $262144, %edi ## imm = 0x40000
+; SSE-NEXT:    je LBB31_38
+; SSE-NEXT:  LBB31_37: ## %cond.store35
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 72(%rdx)
+; SSE-NEXT:    testl $524288, %edi ## imm = 0x80000
+; SSE-NEXT:    je LBB31_40
+; SSE-NEXT:  LBB31_39: ## %cond.store37
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 76(%rdx)
+; SSE-NEXT:    testl $1048576, %edi ## imm = 0x100000
+; SSE-NEXT:    je LBB31_42
+; SSE-NEXT:  LBB31_41: ## %cond.store39
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 80(%rdx)
+; SSE-NEXT:    testl $2097152, %edi ## imm = 0x200000
+; SSE-NEXT:    je LBB31_44
+; SSE-NEXT:  LBB31_43: ## %cond.store41
+; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 4-byte Reload
+; SSE-NEXT:    movl %eax, 84(%rdx)
+; SSE-NEXT:    testl $4194304, %edi ## imm = 0x400000
+; SSE-NEXT:    jne LBB31_45
+; SSE-NEXT:    jmp LBB31_46
 ;
 ; AVX1-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
 ; AVX1:       ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 2610f4322c8e2b..62051d17099403 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1983,91 +1983,75 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movzwl 16(%eax), %edx
 ; X86-SSE-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SSE-NEXT:    movdqa (%eax), %xmm3
-; X86-SSE-NEXT:    movdqa (%ecx), %xmm0
-; X86-SSE-NEXT:    movdqa 16(%ecx), %xmm1
-; X86-SSE-NEXT:    pxor %xmm5, %xmm5
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSE-NEXT:    pextrw $7, %xmm3, %eax
-; X86-SSE-NEXT:    pextrw $4, %xmm3, %edi
-; X86-SSE-NEXT:    pextrw $0, %xmm3, %ebp
-; X86-SSE-NEXT:    pextrw $1, %xmm3, %esi
-; X86-SSE-NEXT:    pextrw $3, %xmm3, %ebx
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X86-SSE-NEXT:    movd %xmm3, %ecx
+; X86-SSE-NEXT:    movdqa (%eax), %xmm2
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE-NEXT:    pextrw $7, %xmm2, %eax
+; X86-SSE-NEXT:    pextrw $4, %xmm2, %esi
+; X86-SSE-NEXT:    pextrw $1, %xmm2, %edi
+; X86-SSE-NEXT:    pextrw $0, %xmm2, %ebx
+; X86-SSE-NEXT:    pextrw $3, %xmm2, %ebp
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl 28(%ecx)
+; X86-SSE-NEXT:    movd %edx, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-SSE-NEXT:    movd %xmm3, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
+; X86-SSE-NEXT:    divl 24(%ecx)
 ; X86-SSE-NEXT:    movd %edx, %xmm3
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; X86-SSE-NEXT:    movd %xmm5, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; X86-SSE-NEXT:    movd %xmm5, %ecx
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
-; X86-SSE-NEXT:    movd %edx, %xmm5
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X86-SSE-NEXT:    divl 16(%ecx)
+; X86-SSE-NEXT:    movd %edx, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    movd %xmm0, %eax
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl 20(%ecx)
+; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; X86-SSE-NEXT:    movl %edi, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    divl 16(%edi)
+; X86-SSE-NEXT:    divl 4(%ecx)
 ; X86-SSE-NEXT:    movd %edx, %xmm3
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X86-SSE-NEXT:    movd %xmm2, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-SSE-NEXT:    movd %xmm1, %ecx
+; X86-SSE-NEXT:    movl %ebx, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
-; X86-SSE-NEXT:    movd %edx, %xmm1
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; X86-SSE-NEXT:    divl (%ecx)
+; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; X86-SSE-NEXT:    movl %ebp, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl (%edi)
-; X86-SSE-NEXT:    movd %edx, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-SSE-NEXT:    movd %xmm2, %ecx
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
-; X86-SSE-NEXT:    movd %edx, %xmm2
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X86-SSE-NEXT:    movd %xmm2, %ecx
-; X86-SSE-NEXT:    movl %ebx, %eax
+; X86-SSE-NEXT:    divl 12(%ecx)
+; X86-SSE-NEXT:    movd %edx, %xmm3
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X86-SSE-NEXT:    movd %xmm2, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
+; X86-SSE-NEXT:    divl 8(%ecx)
 ; X86-SSE-NEXT:    movd %edx, %xmm2
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X86-SSE-NEXT:    movd %xmm4, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X86-SSE-NEXT:    movd %xmm0, %ecx
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %ecx
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; X86-SSE-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 32(%edi)
+; X86-SSE-NEXT:    divl 32(%ecx)
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm4
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT:    pmuludq %xmm2, %xmm3
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X86-SSE-NEXT:    pmuludq %xmm2, %xmm3
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
 ; X86-SSE-NEXT:    movl %eax, (%eax)
-; X86-SSE-NEXT:    movdqa %xmm3, (%eax)
+; X86-SSE-NEXT:    movdqa %xmm1, (%eax)
 ; X86-SSE-NEXT:    movdqa %xmm0, (%eax)
 ; X86-SSE-NEXT:    addl $4, %esp
 ; X86-SSE-NEXT:    popl %esi
@@ -2204,91 +2188,76 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
 ; X64-SSE-LABEL: PR34947:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    movzwl 16(%rdi), %ecx
-; X64-SSE-NEXT:    movdqa (%rdi), %xmm3
-; X64-SSE-NEXT:    movdqa (%rsi), %xmm0
-; X64-SSE-NEXT:    movdqa 16(%rsi), %xmm1
-; X64-SSE-NEXT:    pxor %xmm5, %xmm5
-; X64-SSE-NEXT:    movdqa %xmm3, %xmm2
-; X64-SSE-NEXT:    pextrw $7, %xmm3, %eax
-; X64-SSE-NEXT:    pextrw $4, %xmm3, %r8d
-; X64-SSE-NEXT:    pextrw $0, %xmm3, %r10d
-; X64-SSE-NEXT:    pextrw $1, %xmm3, %edi
-; X64-SSE-NEXT:    pextrw $3, %xmm3, %r9d
-; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X64-SSE-NEXT:    movd %xmm3, %r11d
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %r11d
-; X64-SSE-NEXT:    movd %edx, %xmm3
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; X64-SSE-NEXT:    movd %xmm5, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; X64-SSE-NEXT:    movd %xmm5, %r11d
+; X64-SSE-NEXT:    movdqa (%rdi), %xmm2
+; X64-SSE-NEXT:    pxor %xmm1, %xmm1
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE-NEXT:    pextrw $7, %xmm2, %eax
+; X64-SSE-NEXT:    pextrw $4, %xmm2, %edi
+; X64-SSE-NEXT:    pextrw $1, %xmm2, %r8d
+; X64-SSE-NEXT:    pextrw $0, %xmm2, %r9d
+; X64-SSE-NEXT:    pextrw $3, %xmm2, %r10d
+; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %r11d
-; X64-SSE-NEXT:    movd %edx, %xmm5
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; X64-SSE-NEXT:    movl %r8d, %eax
+; X64-SSE-NEXT:    divl 28(%rsi)
+; X64-SSE-NEXT:    movd %edx, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X64-SSE-NEXT:    movd %xmm3, %eax
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl 16(%rsi)
+; X64-SSE-NEXT:    divl 24(%rsi)
 ; X64-SSE-NEXT:    movd %edx, %xmm3
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X64-SSE-NEXT:    movd %xmm2, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-SSE-NEXT:    movd %xmm1, %r8d
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %r8d
-; X64-SSE-NEXT:    movd %edx, %xmm1
 ; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; X64-SSE-NEXT:    movl %r10d, %eax
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl (%rsi)
+; X64-SSE-NEXT:    divl 16(%rsi)
 ; X64-SSE-NEXT:    movd %edx, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X64-SSE-NEXT:    movd %xmm2, %r8d
-; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-SSE-NEXT:    movd %xmm0, %eax
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %r8d
-; X64-SSE-NEXT:    movd %edx, %xmm2
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X64-SSE-NEXT:    movd %xmm2, %edi
+; X64-SSE-NEXT:    divl 20(%rsi)
+; X64-SSE-NEXT:    movd %edx, %xmm0
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; X64-SSE-NEXT:    movl %r8d, %eax
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl 4(%rsi)
+; X64-SSE-NEXT:    movd %edx, %xmm0
 ; X64-SSE-NEXT:    movl %r9d, %eax
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %edi
-; X64-SSE-NEXT:    movd %edx, %xmm2
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X64-SSE-NEXT:    movd %xmm4, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; X64-SSE-NEXT:    movd %xmm0, %edi
+; X64-SSE-NEXT:    divl (%rsi)
+; X64-SSE-NEXT:    movd %edx, %xmm3
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; X64-SSE-NEXT:    movl %r10d, %eax
 ; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %edi
+; X64-SSE-NEXT:    divl 12(%rsi)
 ; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X64-SSE-NEXT:    movd %xmm2, %eax
+; X64-SSE-NEXT:    xorl %edx, %edx
+; X64-SSE-NEXT:    divl 8(%rsi)
+; X64-SSE-NEXT:    movd %edx, %xmm2
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; X64-SSE-NEXT:    movl %ecx, %eax
 ; X64-SSE-NEXT:    xorl %edx, %edx
 ; X64-SSE-NEXT:    divl 32(%rsi)
 ; X64-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; X64-SSE-NEXT:    pmuludq %xmm0, %xmm3
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X64-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
 ; X64-SSE-NEXT:    movl %eax, (%rax)
-; X64-SSE-NEXT:    movdqa %xmm3, (%rax)
 ; X64-SSE-NEXT:    movdqa %xmm1, (%rax)
+; X64-SSE-NEXT:    movdqa %xmm3, (%rax)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: PR34947:

>From 91896607ffb84561a7a2e466a00fdf1938c5bb63 Mon Sep 17 00:00:00 2001
From: Brandon Wu <brandon.wu at sifive.com>
Date: Wed, 27 Mar 2024 23:03:13 +0800
Subject: [PATCH 31/54] [RISCV] RISCV vector calling convention (1/2) (#77560)

    [RISCV] RISCV vector calling convention (1/2)

    This is the vector calling convention based on
    https://github.com/riscv-non-isa/riscv-elf-psabi-doc.
    The idea is to distinguish "scalar" callee-saved registers
    from "vector" callee-saved registers. The "scalar" ones keep the
    original strategy, whereas the "vector" ones are handled together
    with RVV objects.

    The stack layout would be:

      |--------------------------| <-- FP
      | callee-allocated save    |
      | area for register varargs|
      |--------------------------|
      | callee-saved registers   | <-- scalar callee-saved
      |        (scalar)          |
      |--------------------------|
      | RVV alignment padding    |
      |--------------------------|
      | callee-saved registers   | <-- vector callee-saved
      |        (vector)          |
      |--------------------------|
      | RVV objects              |
      |--------------------------|
      | padding before RVV       |
      |--------------------------|
      | scalar local variables   |
      |--------------------------| <-- BP
      | variable size objects    |
      |--------------------------| <-- SP

    Note: This patch doesn't cover "tuple" types, e.g. vint32m1x2.
          They will be handled in https://github.com/riscv-non-isa/riscv-elf-psabi-doc (2/2).

    Differential Revision: https://reviews.llvm.org/D154576
---
 clang/include/clang-c/Index.h                 |  1 +
 clang/include/clang/Basic/Attr.td             |  7 ++
 clang/include/clang/Basic/AttrDocs.td         | 11 +++
 clang/include/clang/Basic/Specifiers.h        | 43 ++++----
 clang/lib/AST/ItaniumMangle.cpp               |  1 +
 clang/lib/AST/Type.cpp                        |  4 +
 clang/lib/AST/TypePrinter.cpp                 |  6 ++
 clang/lib/Basic/Targets/RISCV.cpp             | 11 +++
 clang/lib/Basic/Targets/RISCV.h               |  2 +
 clang/lib/CodeGen/CGCall.cpp                  |  6 ++
 clang/lib/CodeGen/CGDebugInfo.cpp             |  2 +
 clang/lib/Sema/SemaDeclAttr.cpp               |  7 ++
 clang/lib/Sema/SemaType.cpp                   |  5 +-
 .../RISCV/riscv-vector-callingconv-llvm-ir.c  | 34 +++++++
 .../riscv-vector-callingconv-llvm-ir.cpp      | 32 ++++++
 .../CodeGen/RISCV/riscv-vector-callingconv.c  | 17 ++++
 .../RISCV/riscv-vector-callingconv.cpp        | 35 +++++++
 clang/tools/libclang/CXType.cpp               |  1 +
 llvm/include/llvm/AsmParser/LLToken.h         |  1 +
 llvm/include/llvm/BinaryFormat/Dwarf.def      |  1 +
 llvm/include/llvm/IR/CallingConv.h            |  3 +
 llvm/lib/AsmParser/LLLexer.cpp                |  1 +
 llvm/lib/AsmParser/LLParser.cpp               |  4 +
 llvm/lib/IR/AsmWriter.cpp                     |  3 +
 llvm/lib/Target/RISCV/RISCVCallingConv.td     | 13 +++
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  | 97 +++++++++++++------
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  1 +
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   | 15 +++
 .../CodeGen/RISCV/rvv/callee-saved-regs.ll    | 95 ++++++++++++++++++
 29 files changed, 409 insertions(+), 50 deletions(-)
 create mode 100644 clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
 create mode 100644 clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp
 create mode 100644 clang/test/CodeGen/RISCV/riscv-vector-callingconv.c
 create mode 100644 clang/test/CodeGen/RISCV/riscv-vector-callingconv.cpp
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll

diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index 60db3cf0966c02..7a8bd985a91fc0 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2991,6 +2991,7 @@ enum CXCallingConv {
   CXCallingConv_AArch64SVEPCS = 18,
   CXCallingConv_M68kRTD = 19,
   CXCallingConv_PreserveNone = 20,
+  CXCallingConv_RISCVVectorCall = 21,
 
   CXCallingConv_Invalid = 100,
   CXCallingConv_Unexposed = 200
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 318d4e5ac5ba44..80e607525a0a37 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -3011,6 +3011,13 @@ def PreserveNone : DeclOrTypeAttr, TargetSpecificAttr<TargetAnyX86> {
   let Documentation = [PreserveNoneDocs];
 }
 
+def RISCVVectorCC: DeclOrTypeAttr, TargetSpecificAttr<TargetRISCV> {
+ let Spellings = [CXX11<"riscv", "vector_cc">,
+                  C23<"riscv", "vector_cc">,
+                  Clang<"riscv_vector_cc">];
+ let Documentation = [RISCVVectorCCDocs];
+}
+
 def Target : InheritableAttr {
   let Spellings = [GCC<"target">];
   let Args = [StringArgument<"featuresStr">];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 384aebbdf2e32a..3ea4d676b4f89d 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -5494,6 +5494,17 @@ for clang builtin functions.
   }];
 }
 
+def RISCVVectorCCDocs : Documentation {
+ let Category = DocCatCallingConvs;
+ let Heading = "riscv::vector_cc, riscv_vector_cc, clang::riscv_vector_cc";
+ let Content = [{
+The ``riscv_vector_cc`` attribute can be applied to a function. It preserves 15
+registers, namely v1-v7 and v24-v31, as callee-saved. Callers thus don't need
+to save these registers before function calls, and callees only need to save
+them if they use them.
+ }];
+}
+
 def PreferredNameDocs : Documentation {
   let Category = DocCatDecl;
   let Content = [{
diff --git a/clang/include/clang/Basic/Specifiers.h b/clang/include/clang/Basic/Specifiers.h
index 8586405825cfe0..fb11e8212f8b68 100644
--- a/clang/include/clang/Basic/Specifiers.h
+++ b/clang/include/clang/Basic/Specifiers.h
@@ -273,29 +273,30 @@ namespace clang {
 
   /// CallingConv - Specifies the calling convention that a function uses.
   enum CallingConv {
-    CC_C,           // __attribute__((cdecl))
-    CC_X86StdCall,  // __attribute__((stdcall))
-    CC_X86FastCall, // __attribute__((fastcall))
-    CC_X86ThisCall, // __attribute__((thiscall))
-    CC_X86VectorCall, // __attribute__((vectorcall))
-    CC_X86Pascal,   // __attribute__((pascal))
-    CC_Win64,       // __attribute__((ms_abi))
-    CC_X86_64SysV,  // __attribute__((sysv_abi))
-    CC_X86RegCall, // __attribute__((regcall))
-    CC_AAPCS,       // __attribute__((pcs("aapcs")))
-    CC_AAPCS_VFP,   // __attribute__((pcs("aapcs-vfp")))
-    CC_IntelOclBicc, // __attribute__((intel_ocl_bicc))
-    CC_SpirFunction, // default for OpenCL functions on SPIR target
-    CC_OpenCLKernel, // inferred for OpenCL kernels
-    CC_Swift,        // __attribute__((swiftcall))
+    CC_C,                 // __attribute__((cdecl))
+    CC_X86StdCall,        // __attribute__((stdcall))
+    CC_X86FastCall,       // __attribute__((fastcall))
+    CC_X86ThisCall,       // __attribute__((thiscall))
+    CC_X86VectorCall,     // __attribute__((vectorcall))
+    CC_X86Pascal,         // __attribute__((pascal))
+    CC_Win64,             // __attribute__((ms_abi))
+    CC_X86_64SysV,        // __attribute__((sysv_abi))
+    CC_X86RegCall,        // __attribute__((regcall))
+    CC_AAPCS,             // __attribute__((pcs("aapcs")))
+    CC_AAPCS_VFP,         // __attribute__((pcs("aapcs-vfp")))
+    CC_IntelOclBicc,      // __attribute__((intel_ocl_bicc))
+    CC_SpirFunction,      // default for OpenCL functions on SPIR target
+    CC_OpenCLKernel,      // inferred for OpenCL kernels
+    CC_Swift,             // __attribute__((swiftcall))
     CC_SwiftAsync,        // __attribute__((swiftasynccall))
-    CC_PreserveMost, // __attribute__((preserve_most))
-    CC_PreserveAll,  // __attribute__((preserve_all))
+    CC_PreserveMost,      // __attribute__((preserve_most))
+    CC_PreserveAll,       // __attribute__((preserve_all))
     CC_AArch64VectorCall, // __attribute__((aarch64_vector_pcs))
-    CC_AArch64SVEPCS, // __attribute__((aarch64_sve_pcs))
-    CC_AMDGPUKernelCall, // __attribute__((amdgpu_kernel))
-    CC_M68kRTD,       // __attribute__((m68k_rtd))
-    CC_PreserveNone,  // __attribute__((preserve_none))
+    CC_AArch64SVEPCS,     // __attribute__((aarch64_sve_pcs))
+    CC_AMDGPUKernelCall,  // __attribute__((amdgpu_kernel))
+    CC_M68kRTD,           // __attribute__((m68k_rtd))
+    CC_PreserveNone,      // __attribute__((preserve_none))
+    CC_RISCVVectorCall,   // __attribute__((riscv_vector_cc))
   };
 
   /// Checks whether the given calling convention supports variadic
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index f619d657ae9f50..425f84e8af1fe7 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3445,6 +3445,7 @@ StringRef CXXNameMangler::getCallingConvQualifierName(CallingConv CC) {
   case CC_PreserveAll:
   case CC_M68kRTD:
   case CC_PreserveNone:
+  case CC_RISCVVectorCall:
     // FIXME: we should be mangling all of the above.
     return "";
 
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index d2ffb23845acab..8f3e26d4601921 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3484,6 +3484,9 @@ StringRef FunctionType::getNameForCallConv(CallingConv CC) {
   case CC_PreserveAll: return "preserve_all";
   case CC_M68kRTD: return "m68k_rtd";
   case CC_PreserveNone: return "preserve_none";
+    // clang-format off
+  case CC_RISCVVectorCall: return "riscv_vector_cc";
+    // clang-format on
   }
 
   llvm_unreachable("Invalid calling convention.");
@@ -4074,6 +4077,7 @@ bool AttributedType::isCallingConv() const {
   case attr::PreserveAll:
   case attr::M68kRTD:
   case attr::PreserveNone:
+  case attr::RISCVVectorCC:
     return true;
   }
   llvm_unreachable("invalid attr kind");
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index f176d043d52521..0aa1d9327d7707 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -1071,6 +1071,9 @@ void TypePrinter::printFunctionAfter(const FunctionType::ExtInfo &Info,
     case CC_PreserveNone:
       OS << " __attribute__((preserve_none))";
       break;
+    case CC_RISCVVectorCall:
+      OS << "__attribute__((riscv_vector_cc))";
+      break;
     }
   }
 
@@ -1960,6 +1963,9 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
   case attr::PreserveNone:
     OS << "preserve_none";
     break;
+  case attr::RISCVVectorCC:
+    OS << "riscv_vector_cc";
+    break;
   case attr::NoDeref:
     OS << "noderef";
     break;
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index a6d4af2b88111a..f3d705e1551fe2 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -467,3 +467,14 @@ ParsedTargetAttr RISCVTargetInfo::parseTargetAttr(StringRef Features) const {
   }
   return Ret;
 }
+
+TargetInfo::CallingConvCheckResult
+RISCVTargetInfo::checkCallingConvention(CallingConv CC) const {
+  switch (CC) {
+  default:
+    return CCCR_Warning;
+  case CC_C:
+  case CC_RISCVVectorCall:
+    return CCCR_OK;
+  }
+}
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index bfbdafb682c851..78580b5b1c1063 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -110,6 +110,8 @@ class RISCVTargetInfo : public TargetInfo {
 
   bool hasBFloat16Type() const override { return true; }
 
+  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override;
+
   bool useFP16ConversionIntrinsics() const override {
     return false;
   }
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 475d96b0e87d74..b8adf5c26b3a35 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -74,6 +74,9 @@ unsigned CodeGenTypes::ClangCallConvToLLVMCallConv(CallingConv CC) {
   case CC_SwiftAsync: return llvm::CallingConv::SwiftTail;
   case CC_M68kRTD: return llvm::CallingConv::M68k_RTD;
   case CC_PreserveNone: return llvm::CallingConv::PreserveNone;
+    // clang-format off
+  case CC_RISCVVectorCall: return llvm::CallingConv::RISCV_VectorCall;
+    // clang-format on
   }
 }
 
@@ -260,6 +263,9 @@ static CallingConv getCallingConventionForDecl(const ObjCMethodDecl *D,
   if (D->hasAttr<PreserveNoneAttr>())
     return CC_PreserveNone;
 
+  if (D->hasAttr<RISCVVectorCCAttr>())
+    return CC_RISCVVectorCall;
+
   return CC_C;
 }
 
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 0e20de2005b24b..2a385d85aa2bc3 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -1452,6 +1452,8 @@ static unsigned getDwarfCC(CallingConv CC) {
     return llvm::dwarf::DW_CC_LLVM_M68kRTD;
   case CC_PreserveNone:
     return llvm::dwarf::DW_CC_LLVM_PreserveNone;
+  case CC_RISCVVectorCall:
+    return llvm::dwarf::DW_CC_LLVM_RISCVVectorCall;
   }
   return 0;
 }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 0a62c656d824ff..f25f3afd0f4af2 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5271,6 +5271,9 @@ static void handleCallConvAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   case ParsedAttr::AT_PreserveNone:
     D->addAttr(::new (S.Context) PreserveNoneAttr(S.Context, AL));
     return;
+  case ParsedAttr::AT_RISCVVectorCC:
+    D->addAttr(::new (S.Context) RISCVVectorCCAttr(S.Context, AL));
+    return;
   default:
     llvm_unreachable("unexpected attribute kind");
   }
@@ -5475,6 +5478,9 @@ bool Sema::CheckCallingConvAttr(const ParsedAttr &Attrs, CallingConv &CC,
   case ParsedAttr::AT_PreserveNone:
     CC = CC_PreserveNone;
     break;
+  case ParsedAttr::AT_RISCVVectorCC:
+    CC = CC_RISCVVectorCall;
+    break;
   default: llvm_unreachable("unexpected attribute kind");
   }
 
@@ -9637,6 +9643,7 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUKernelCall:
   case ParsedAttr::AT_M68kRTD:
   case ParsedAttr::AT_PreserveNone:
+  case ParsedAttr::AT_RISCVVectorCC:
     handleCallConvAttr(S, D, AL);
     break;
   case ParsedAttr::AT_Suppress:
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index d7521a5363a3d2..fd94caa4e1d449 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -138,7 +138,8 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr,
   case ParsedAttr::AT_PreserveMost:                                            \
   case ParsedAttr::AT_PreserveAll:                                             \
   case ParsedAttr::AT_M68kRTD:                                                 \
-  case ParsedAttr::AT_PreserveNone
+  case ParsedAttr::AT_PreserveNone:                                            \
+  case ParsedAttr::AT_RISCVVectorCC
 
 // Function type attributes.
 #define FUNCTION_TYPE_ATTRS_CASELIST                                           \
@@ -7939,6 +7940,8 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) {
     return createSimpleAttr<M68kRTDAttr>(Ctx, Attr);
   case ParsedAttr::AT_PreserveNone:
     return createSimpleAttr<PreserveNoneAttr>(Ctx, Attr);
+  case ParsedAttr::AT_RISCVVectorCC:
+    return createSimpleAttr<RISCVVectorCCAttr>(Ctx, Attr);
   }
   llvm_unreachable("unexpected attribute kind!");
 }
diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
new file mode 100644
index 00000000000000..072d8a863d4570
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c
@@ -0,0 +1,34 @@
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-LLVM %s
+// RUN: %clang_cc1 -std=c23 -triple riscv64 -target-feature +v \
+// RUN:   -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-LLVM %s
+
+#include <riscv_vector.h>
+
+// CHECK-LLVM: call riscv_vector_cc <vscale x 2 x i32> @bar
+vint32m1_t __attribute__((riscv_vector_cc)) bar(vint32m1_t input);
+vint32m1_t test_vector_cc_attr(vint32m1_t input, int32_t *base, size_t vl) {
+  vint32m1_t val = __riscv_vle32_v_i32m1(base, vl);
+  vint32m1_t ret = bar(input);
+  __riscv_vse32_v_i32m1(base, val, vl);
+  return ret;
+}
+
+// CHECK-LLVM: call riscv_vector_cc <vscale x 2 x i32> @bar
+[[riscv::vector_cc]] vint32m1_t bar(vint32m1_t input);
+vint32m1_t test_vector_cc_attr2(vint32m1_t input, int32_t *base, size_t vl) {
+  vint32m1_t val = __riscv_vle32_v_i32m1(base, vl);
+  vint32m1_t ret = bar(input);
+  __riscv_vse32_v_i32m1(base, val, vl);
+  return ret;
+}
+
+// CHECK-LLVM: call <vscale x 2 x i32> @baz
+vint32m1_t baz(vint32m1_t input);
+vint32m1_t test_no_vector_cc_attr(vint32m1_t input, int32_t *base, size_t vl) {
+  vint32m1_t val = __riscv_vle32_v_i32m1(base, vl);
+  vint32m1_t ret = baz(input);
+  __riscv_vse32_v_i32m1(base, val, vl);
+  return ret;
+}
diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp
new file mode 100644
index 00000000000000..c01aeb21f67571
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp
@@ -0,0 +1,32 @@
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -std=c++11 -triple riscv64 -target-feature +v \
+// RUN:   -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-LLVM %s
+
+#include <riscv_vector.h>
+
+// CHECK-LLVM: call riscv_vector_cc <vscale x 2 x i32> @_Z3baru15__rvv_int32m1_t
+vint32m1_t __attribute__((riscv_vector_cc)) bar(vint32m1_t input);
+vint32m1_t test_vector_cc_attr(vint32m1_t input, int32_t *base, size_t vl) {
+  vint32m1_t val = __riscv_vle32_v_i32m1(base, vl);
+  vint32m1_t ret = bar(input);
+  __riscv_vse32_v_i32m1(base, val, vl);
+  return ret;
+}
+
+// CHECK-LLVM: call riscv_vector_cc <vscale x 2 x i32> @_Z3baru15__rvv_int32m1_t
+[[riscv::vector_cc]] vint32m1_t bar(vint32m1_t input);
+vint32m1_t test_vector_cc_attr2(vint32m1_t input, int32_t *base, size_t vl) {
+  vint32m1_t val = __riscv_vle32_v_i32m1(base, vl);
+  vint32m1_t ret = bar(input);
+  __riscv_vse32_v_i32m1(base, val, vl);
+  return ret;
+}
+
+// CHECK-LLVM: call <vscale x 2 x i32> @_Z3bazu15__rvv_int32m1_t
+vint32m1_t baz(vint32m1_t input);
+vint32m1_t test_no_vector_cc_attr(vint32m1_t input, int32_t *base, size_t vl) {
+  vint32m1_t val = __riscv_vle32_v_i32m1(base, vl);
+  vint32m1_t ret = baz(input);
+  __riscv_vse32_v_i32m1(base, val, vl);
+  return ret;
+}
diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv.c b/clang/test/CodeGen/RISCV/riscv-vector-callingconv.c
new file mode 100644
index 00000000000000..5c35901799b427
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 %s -std=c23 -triple riscv64 -target-feature +v -verify
+
+__attribute__((riscv_vector_cc)) int var; // expected-warning {{'riscv_vector_cc' only applies to function types; type here is 'int'}}
+
+__attribute__((riscv_vector_cc)) void func();
+__attribute__((riscv_vector_cc(1))) void func_invalid(); // expected-error {{'riscv_vector_cc' attribute takes no arguments}}
+
+void test_no_attribute(int); // expected-note {{previous declaration is here}}
+void __attribute__((riscv_vector_cc)) test_no_attribute(int x) { } // expected-error {{function declared 'riscv_vector_cc' here was previously declared without calling convention}}
+
+[[riscv::vector_cc]] int var2; // expected-warning {{'vector_cc' only applies to function types; type here is 'int'}}
+
+[[riscv::vector_cc]] void func2();
+[[riscv::vector_cc(1)]] void func_invalid2(); // expected-error {{'vector_cc' attribute takes no arguments}}
+
+void test_no_attribute2(int); // expected-note {{previous declaration is here}}
+[[riscv::vector_cc]] void test_no_attribute2(int x) { } // expected-error {{function declared 'riscv_vector_cc' here was previously declared without calling convention}}
diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv.cpp b/clang/test/CodeGen/RISCV/riscv-vector-callingconv.cpp
new file mode 100644
index 00000000000000..264bb7d9ad7c00
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -triple riscv64 -target-feature +v -verify
+
+__attribute__((riscv_vector_cc)) int var; // expected-warning {{'riscv_vector_cc' only applies to function types; type here is 'int'}}
+
+__attribute__((riscv_vector_cc)) void func();
+__attribute__((riscv_vector_cc(1))) void func_invalid(); // expected-error {{'riscv_vector_cc' attribute takes no arguments}}
+
+void test_no_attribute(int); // expected-note {{previous declaration is here}}
+void __attribute__((riscv_vector_cc)) test_no_attribute(int x) { } // expected-error {{function declared 'riscv_vector_cc' here was previously declared without calling convention}}
+
+class test_cc {
+  __attribute__((riscv_vector_cc)) void member_func();
+};
+
+void test_lambda() {
+  __attribute__((riscv_vector_cc)) auto lambda = []() { // expected-warning {{'riscv_vector_cc' only applies to function types; type here is 'auto'}}
+  };
+}
+
+[[riscv::vector_cc]] int var2; // expected-warning {{'vector_cc' only applies to function types; type here is 'int'}}
+
+[[riscv::vector_cc]] void func2();
+[[riscv::vector_cc(1)]] void func_invalid2(); // expected-error {{'vector_cc' attribute takes no arguments}}
+
+void test_no_attribute2(int); // expected-note {{previous declaration is here}}
+[[riscv::vector_cc]] void test_no_attribute2(int x) { } // expected-error {{function declared 'riscv_vector_cc' here was previously declared without calling convention}}
+
+class test_cc2 {
+  [[riscv::vector_cc]] void member_func();
+};
+
+void test_lambda2() {
+  [[riscv::vector_cc]] auto lambda = []() { // expected-warning {{'vector_cc' only applies to function types; type here is 'auto'}}
+  };
+}
diff --git a/clang/tools/libclang/CXType.cpp b/clang/tools/libclang/CXType.cpp
index 292d524f00abd6..991767dc4c49c6 100644
--- a/clang/tools/libclang/CXType.cpp
+++ b/clang/tools/libclang/CXType.cpp
@@ -680,6 +680,7 @@ CXCallingConv clang_getFunctionTypeCallingConv(CXType X) {
       TCALLINGCONV(PreserveAll);
       TCALLINGCONV(M68kRTD);
       TCALLINGCONV(PreserveNone);
+      TCALLINGCONV(RISCVVectorCall);
     case CC_SpirFunction: return CXCallingConv_Unexposed;
     case CC_AMDGPUKernelCall: return CXCallingConv_Unexposed;
     case CC_OpenCLKernel: return CXCallingConv_Unexposed;
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index 5863a8d6e8ee84..65ccb1b81b3a87 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -181,6 +181,7 @@ enum Kind {
   kw_tailcc,
   kw_m68k_rtdcc,
   kw_graalcc,
+  kw_riscv_vector_cc,
 
   // Attributes:
   kw_attributes,
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.def b/llvm/include/llvm/BinaryFormat/Dwarf.def
index e70b58d5ea50fc..d8927c6202fd57 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.def
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.def
@@ -1040,6 +1040,7 @@ HANDLE_DW_CC(0xca, LLVM_PreserveAll)
 HANDLE_DW_CC(0xcb, LLVM_X86RegCall)
 HANDLE_DW_CC(0xcc, LLVM_M68kRTD)
 HANDLE_DW_CC(0xcd, LLVM_PreserveNone)
+HANDLE_DW_CC(0xce, LLVM_RISCVVectorCall)
 // From GCC source code (include/dwarf2.h): This DW_CC_ value is not currently
 // generated by any toolchain.  It is used internally to GDB to indicate OpenCL
 // C functions that have been compiled with the IBM XL C for OpenCL compiler and
diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
index ef8aaf52f4e6ac..a05d1a4d587845 100644
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -264,6 +264,9 @@ namespace CallingConv {
     /// except that the first parameter is mapped to x9.
     ARM64EC_Thunk_Native = 109,
 
+    /// Calling convention used for RISC-V V-extension.
+    RISCV_VectorCall = 110,
+
     /// The highest possible ID. Must be some 2^k - 1.
     MaxID = 1023
   };
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 02f64fcfac4f0c..2301a27731eaff 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -640,6 +640,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(tailcc);
   KEYWORD(m68k_rtdcc);
   KEYWORD(graalcc);
+  KEYWORD(riscv_vector_cc);
 
   KEYWORD(cc);
   KEYWORD(c);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index f0be021668afa7..41d48e5226203c 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2143,6 +2143,7 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= 'tailcc'
 ///   ::= 'm68k_rtdcc'
 ///   ::= 'graalcc'
+///   ::= 'riscv_vector_cc'
 ///   ::= 'cc' UINT
 ///
 bool LLParser::parseOptionalCallingConv(unsigned &CC) {
@@ -2213,6 +2214,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_tailcc:         CC = CallingConv::Tail; break;
   case lltok::kw_m68k_rtdcc:     CC = CallingConv::M68k_RTD; break;
   case lltok::kw_graalcc:        CC = CallingConv::GRAAL; break;
+  case lltok::kw_riscv_vector_cc:
+    CC = CallingConv::RISCV_VectorCall;
+    break;
   case lltok::kw_cc: {
       Lex.Lex();
       return parseUInt32(CC);
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 38c191a2dec60e..84690f02613902 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -363,6 +363,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
   case CallingConv::AMDGPU_Gfx:    Out << "amdgpu_gfx"; break;
   case CallingConv::M68k_RTD:      Out << "m68k_rtdcc"; break;
+  case CallingConv::RISCV_VectorCall:
+    Out << "riscv_vector_cc";
+    break;
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index 11b716f20f3716..ad06f477437702 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -26,6 +26,19 @@ def CSR_ILP32D_LP64D
     : CalleeSavedRegs<(add CSR_ILP32_LP64,
                        F8_D, F9_D, (sequence "F%u_D", 18, 27))>;
 
+defvar CSR_V = (add (sequence "V%u", 1, 7), (sequence "V%u", 24, 31),
+                     V2M2, V4M2, V6M2, V24M2, V26M2, V28M2, V30M2,
+                     V4M4, V24M4, V28M4, V24M8);
+
+def CSR_ILP32_LP64_V
+    : CalleeSavedRegs<(add CSR_ILP32_LP64, CSR_V)>;
+
+def CSR_ILP32F_LP64F_V
+    : CalleeSavedRegs<(add CSR_ILP32F_LP64F, CSR_V)>;
+
+def CSR_ILP32D_LP64D_V
+    : CalleeSavedRegs<(add CSR_ILP32D_LP64D, CSR_V)>;
+
 // Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
 def CSR_NoRegs : CalleeSavedRegs<(add)>;
 
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 39f2b3f62a9a0c..39075c81b2921f 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -388,6 +388,21 @@ getUnmanagedCSI(const MachineFunction &MF,
   return NonLibcallCSI;
 }
 
+static SmallVector<CalleeSavedInfo, 8>
+getRVVCalleeSavedInfo(const MachineFunction &MF,
+                      const std::vector<CalleeSavedInfo> &CSI) {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  SmallVector<CalleeSavedInfo, 8> RVVCSI;
+
+  for (auto &CS : CSI) {
+    int FI = CS.getFrameIdx();
+    if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::ScalableVector)
+      RVVCSI.push_back(CS);
+  }
+
+  return RVVCSI;
+}
+
 void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF,
                                            MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MBBI,
@@ -590,6 +605,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   // directives.
   for (const auto &Entry : CSI) {
     int FrameIdx = Entry.getFrameIdx();
+    if (FrameIdx >= 0 &&
+        MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
+      continue;
+
     int64_t Offset = MFI.getObjectOffset(FrameIdx);
     Register Reg = Entry.getReg();
     unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
@@ -726,7 +745,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
 
   const auto &CSI = getUnmanagedCSI(MF, MFI.getCalleeSavedInfo());
 
-  // Skip to before the restores of callee-saved registers
+  // Skip to before the restores of scalar callee-saved registers
   // FIXME: assumes exactly one instruction is used to restore each
   // callee-saved register.
   auto LastFrameDestroy = MBBI;
@@ -1029,15 +1048,24 @@ RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFunction &MF) const {
   MachineFrameInfo &MFI = MF.getFrameInfo();
   // Create a buffer of RVV objects to allocate.
   SmallVector<int, 8> ObjectsToAllocate;
-  for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
-    unsigned StackID = MFI.getStackID(I);
-    if (StackID != TargetStackID::ScalableVector)
-      continue;
-    if (MFI.isDeadObjectIndex(I))
-      continue;
+  auto pushRVVObjects = [&](int FIBegin, int FIEnd) {
+    for (int I = FIBegin, E = FIEnd; I != E; ++I) {
+      unsigned StackID = MFI.getStackID(I);
+      if (StackID != TargetStackID::ScalableVector)
+        continue;
+      if (MFI.isDeadObjectIndex(I))
+        continue;
 
-    ObjectsToAllocate.push_back(I);
-  }
+      ObjectsToAllocate.push_back(I);
+    }
+  };
+  // First push the RVV callee-saved objects, then push the RVV stack objects.
+  std::vector<CalleeSavedInfo> &CSI = MF.getFrameInfo().getCalleeSavedInfo();
+  const auto &RVVCSI = getRVVCalleeSavedInfo(MF, CSI);
+  if (!RVVCSI.empty())
+    pushRVVObjects(RVVCSI[0].getFrameIdx(),
+                   RVVCSI[RVVCSI.size() - 1].getFrameIdx() + 1);
+  pushRVVObjects(0, MFI.getObjectIndexEnd() - RVVCSI.size());
 
   // The minimum alignment is 16 bytes.
   Align RVVStackAlign(16);
@@ -1487,13 +1515,19 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters(
 
   // Manually spill values not spilled by libcall & Push/Pop.
   const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI);
-  for (auto &CS : UnmanagedCSI) {
-    // Insert the spill to the stack frame.
-    Register Reg = CS.getReg();
-    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), CS.getFrameIdx(),
-                            RC, TRI, Register());
-  }
+  const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI);
+
+  auto storeRegToStackSlot = [&](decltype(UnmanagedCSI) CSInfo) {
+    for (auto &CS : CSInfo) {
+      // Insert the spill to the stack frame.
+      Register Reg = CS.getReg();
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg),
+                              CS.getFrameIdx(), RC, TRI, Register());
+    }
+  };
+  storeRegToStackSlot(UnmanagedCSI);
+  storeRegToStackSlot(RVVCSI);
 
   return true;
 }
@@ -1511,19 +1545,26 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters(
     DL = MI->getDebugLoc();
 
   // Manually restore values not restored by libcall & Push/Pop.
-  // Keep the same order as in the prologue. There is no need to reverse the
-  // order in the epilogue. In addition, the return address will be restored
-  // first in the epilogue. It increases the opportunity to avoid the
-  // load-to-use data hazard between loading RA and return by RA.
-  // loadRegFromStackSlot can insert multiple instructions.
+  // Restore in the reverse of the spill order: RVVCSI, then UnmanagedCSI.
+  // In addition, the return address will be restored first in the
+  // epilogue. It increases the opportunity to avoid the load-to-use data
+  // hazard between loading RA and returning via RA.  loadRegFromStackSlot
+  // can insert multiple instructions.
   const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI);
-  for (auto &CS : UnmanagedCSI) {
-    Register Reg = CS.getReg();
-    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI,
-                             Register());
-    assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
-  }
+  const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI);
+
+  auto loadRegFromStackSlot = [&](decltype(UnmanagedCSI) CSInfo) {
+    for (auto &CS : CSInfo) {
+      Register Reg = CS.getReg();
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI,
+                               Register());
+      assert(MI != MBB.begin() &&
+             "loadRegFromStackSlot didn't insert any code!");
+    }
+  };
+  loadRegFromStackSlot(RVVCSI);
+  loadRegFromStackSlot(UnmanagedCSI);
 
   RISCVMachineFunctionInfo *RVFI = MF->getInfo<RISCVMachineFunctionInfo>();
   if (RVFI->isPushable(*MF)) {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ca78648c6aa9d8..564fda674317f4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18724,6 +18724,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
   case CallingConv::Fast:
   case CallingConv::SPIR_KERNEL:
   case CallingConv::GRAAL:
+  case CallingConv::RISCV_VectorCall:
     break;
   case CallingConv::GHC:
     if (Subtarget.isRVE())
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 74d65324b95d86..11c3f2d57eb00f 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -71,6 +71,9 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
                              : CSR_Interrupt_SaveList;
   }
 
+  bool HasVectorCSR =
+      MF->getFunction().getCallingConv() == CallingConv::RISCV_VectorCall;
+
   switch (Subtarget.getTargetABI()) {
   default:
     llvm_unreachable("Unrecognized ABI");
@@ -79,12 +82,18 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return CSR_ILP32E_LP64E_SaveList;
   case RISCVABI::ABI_ILP32:
   case RISCVABI::ABI_LP64:
+    if (HasVectorCSR)
+      return CSR_ILP32_LP64_V_SaveList;
     return CSR_ILP32_LP64_SaveList;
   case RISCVABI::ABI_ILP32F:
   case RISCVABI::ABI_LP64F:
+    if (HasVectorCSR)
+      return CSR_ILP32F_LP64F_V_SaveList;
     return CSR_ILP32F_LP64F_SaveList;
   case RISCVABI::ABI_ILP32D:
   case RISCVABI::ABI_LP64D:
+    if (HasVectorCSR)
+      return CSR_ILP32D_LP64D_V_SaveList;
     return CSR_ILP32D_LP64D_SaveList;
   }
 }
@@ -665,12 +674,18 @@ RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
     return CSR_ILP32E_LP64E_RegMask;
   case RISCVABI::ABI_ILP32:
   case RISCVABI::ABI_LP64:
+    if (CC == CallingConv::RISCV_VectorCall)
+      return CSR_ILP32_LP64_V_RegMask;
     return CSR_ILP32_LP64_RegMask;
   case RISCVABI::ABI_ILP32F:
   case RISCVABI::ABI_LP64F:
+    if (CC == CallingConv::RISCV_VectorCall)
+      return CSR_ILP32F_LP64F_V_RegMask;
     return CSR_ILP32F_LP64F_RegMask;
   case RISCVABI::ABI_ILP32D:
   case RISCVABI::ABI_LP64D:
+    if (CC == CallingConv::RISCV_VectorCall)
+      return CSR_ILP32D_LP64D_V_RegMask;
     return CSR_ILP32D_LP64D_RegMask;
   }
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll
new file mode 100644
index 00000000000000..84936d88e1874f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m -mattr=+v -O2 < %s \
+; RUN:    | FileCheck --check-prefix=SPILL-O2 %s
+
+define <vscale x 1 x i32> @test_vector_std(<vscale x 1 x i32> %va) nounwind {
+; SPILL-O2-LABEL: test_vector_std:
+; SPILL-O2:       # %bb.0: # %entry
+; SPILL-O2-NEXT:    addi sp, sp, -16
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    slli a0, a0, 1
+; SPILL-O2-NEXT:    sub sp, sp, a0
+; SPILL-O2-NEXT:    addi a0, sp, 16
+; SPILL-O2-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-NEXT:    #APP
+; SPILL-O2-NEXT:    #NO_APP
+; SPILL-O2-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    slli a0, a0, 1
+; SPILL-O2-NEXT:    add sp, sp, a0
+; SPILL-O2-NEXT:    addi sp, sp, 16
+; SPILL-O2-NEXT:    ret
+entry:
+  call void asm sideeffect "",
+  "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+
+  ret <vscale x 1 x i32> %va
+}
+
+define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee(<vscale x 1 x i32> %va) nounwind {
+; SPILL-O2-LABEL: test_vector_callee:
+; SPILL-O2:       # %bb.0: # %entry
+; SPILL-O2-NEXT:    addi sp, sp, -16
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    slli a0, a0, 4
+; SPILL-O2-NEXT:    sub sp, sp, a0
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    slli a1, a0, 4
+; SPILL-O2-NEXT:    sub a0, a1, a0
+; SPILL-O2-NEXT:    add a0, sp, a0
+; SPILL-O2-NEXT:    addi a0, a0, 16
+; SPILL-O2-NEXT:    vs1r.v v1, (a0) # Unknown-size Folded Spill
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    li a1, 13
+; SPILL-O2-NEXT:    mul a0, a0, a1
+; SPILL-O2-NEXT:    add a0, sp, a0
+; SPILL-O2-NEXT:    addi a0, a0, 16
+; SPILL-O2-NEXT:    vs2r.v v2, (a0) # Unknown-size Folded Spill
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    slli a1, a0, 3
+; SPILL-O2-NEXT:    add a0, a1, a0
+; SPILL-O2-NEXT:    add a0, sp, a0
+; SPILL-O2-NEXT:    addi a0, a0, 16
+; SPILL-O2-NEXT:    vs4r.v v4, (a0) # Unknown-size Folded Spill
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    add a0, sp, a0
+; SPILL-O2-NEXT:    addi a0, a0, 16
+; SPILL-O2-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; SPILL-O2-NEXT:    addi a0, sp, 16
+; SPILL-O2-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; SPILL-O2-NEXT:    #APP
+; SPILL-O2-NEXT:    #NO_APP
+; SPILL-O2-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    slli a1, a0, 4
+; SPILL-O2-NEXT:    sub a0, a1, a0
+; SPILL-O2-NEXT:    add a0, sp, a0
+; SPILL-O2-NEXT:    addi a0, a0, 16
+; SPILL-O2-NEXT:    vl1r.v v1, (a0) # Unknown-size Folded Reload
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    li a1, 13
+; SPILL-O2-NEXT:    mul a0, a0, a1
+; SPILL-O2-NEXT:    add a0, sp, a0
+; SPILL-O2-NEXT:    addi a0, a0, 16
+; SPILL-O2-NEXT:    vl2r.v v2, (a0) # Unknown-size Folded Reload
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    slli a1, a0, 3
+; SPILL-O2-NEXT:    add a0, a1, a0
+; SPILL-O2-NEXT:    add a0, sp, a0
+; SPILL-O2-NEXT:    addi a0, a0, 16
+; SPILL-O2-NEXT:    vl4r.v v4, (a0) # Unknown-size Folded Reload
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    add a0, sp, a0
+; SPILL-O2-NEXT:    addi a0, a0, 16
+; SPILL-O2-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; SPILL-O2-NEXT:    csrr a0, vlenb
+; SPILL-O2-NEXT:    slli a0, a0, 4
+; SPILL-O2-NEXT:    add sp, sp, a0
+; SPILL-O2-NEXT:    addi sp, sp, 16
+; SPILL-O2-NEXT:    ret
+entry:
+  call void asm sideeffect "",
+  "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+
+  ret <vscale x 1 x i32> %va
+}

>From 58de1e2c5eee548a9b365e3b1554d87317072ad9 Mon Sep 17 00:00:00 2001
From: Wesley Wiser <wwiser at gmail.com>
Date: Wed, 27 Mar 2024 15:05:58 +0000
Subject: [PATCH 32/54] Fix stack layout for frames larger than 2gb (#84114)

For very large stack frames, the offset from the stack pointer to a local can be more than 2^31, which overflows various `int` offsets in the frame lowering code.

This patch updates the frame lowering code to calculate the offsets as 64-bit values and resolves the overflows, resulting in the correct codegen for very large frames.

Fixes #48911
---
 llvm/include/llvm/CodeGen/MachineFrameInfo.h  | 14 +++----
 .../llvm/CodeGen/TargetFrameLowering.h        |  4 +-
 llvm/include/llvm/MC/MCAsmBackend.h           |  2 +-
 llvm/include/llvm/MC/MCDwarf.h                | 40 +++++++++----------
 llvm/lib/CodeGen/CFIInstrInserter.cpp         | 10 ++---
 llvm/lib/CodeGen/MachineFrameInfo.cpp         |  2 +-
 llvm/lib/CodeGen/PrologEpilogInserter.cpp     |  4 +-
 llvm/lib/MC/MCDwarf.cpp                       |  6 +--
 .../MCTargetDesc/AArch64AsmBackend.cpp        | 10 ++---
 llvm/lib/Target/ARM/ARMFrameLowering.cpp      |  4 +-
 .../Target/ARM/MCTargetDesc/ARMAsmBackend.cpp |  2 +-
 .../ARM/MCTargetDesc/ARMAsmBackendDarwin.h    |  2 +-
 .../Target/Hexagon/HexagonFrameLowering.cpp   |  4 +-
 .../lib/Target/MSP430/MSP430FrameLowering.cpp |  2 +-
 .../Target/X86/MCTargetDesc/X86AsmBackend.cpp | 13 +++---
 .../X86/MCTargetDesc/X86MCCodeEmitter.cpp     | 14 ++++---
 llvm/lib/Target/X86/X86FrameLowering.cpp      | 28 ++++++-------
 llvm/lib/Target/X86/X86FrameLowering.h        |  5 ++-
 llvm/lib/Target/X86/X86RegisterInfo.cpp       | 10 +++--
 llvm/test/CodeGen/PowerPC/huge-frame-size.ll  |  2 +-
 llvm/test/CodeGen/X86/huge-stack.ll           | 24 +++++++++++
 21 files changed, 116 insertions(+), 86 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/huge-stack.ll

diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 0fe73fec7ee67f..ad6142b46515bf 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -251,7 +251,7 @@ class MachineFrameInfo {
   /// targets, this value is only used when generating debug info (via
   /// TargetRegisterInfo::getFrameIndexReference); when generating code, the
   /// corresponding adjustments are performed directly.
-  int OffsetAdjustment = 0;
+  int64_t OffsetAdjustment = 0;
 
   /// The prolog/epilog code inserter may process objects that require greater
   /// alignment than the default alignment the target provides.
@@ -280,7 +280,7 @@ class MachineFrameInfo {
   /// setup/destroy pseudo instructions (as defined in the TargetFrameInfo
   /// class).  This information is important for frame pointer elimination.
   /// It is only valid during and after prolog/epilog code insertion.
-  unsigned MaxCallFrameSize = ~0u;
+  uint64_t MaxCallFrameSize = ~UINT64_C(0);
 
   /// The number of bytes of callee saved registers that the target wants to
   /// report for the current function in the CodeView S_FRAMEPROC record.
@@ -591,10 +591,10 @@ class MachineFrameInfo {
   uint64_t estimateStackSize(const MachineFunction &MF) const;
 
   /// Return the correction for frame offsets.
-  int getOffsetAdjustment() const { return OffsetAdjustment; }
+  int64_t getOffsetAdjustment() const { return OffsetAdjustment; }
 
   /// Set the correction for frame offsets.
-  void setOffsetAdjustment(int Adj) { OffsetAdjustment = Adj; }
+  void setOffsetAdjustment(int64_t Adj) { OffsetAdjustment = Adj; }
 
   /// Return the alignment in bytes that this function must be aligned to,
   /// which is greater than the default stack alignment provided by the target.
@@ -655,7 +655,7 @@ class MachineFrameInfo {
   /// CallFrameSetup/Destroy pseudo instructions are used by the target, and
   /// then only during or after prolog/epilog code insertion.
   ///
-  unsigned getMaxCallFrameSize() const {
+  uint64_t getMaxCallFrameSize() const {
     // TODO: Enable this assert when targets are fixed.
     //assert(isMaxCallFrameSizeComputed() && "MaxCallFrameSize not computed yet");
     if (!isMaxCallFrameSizeComputed())
@@ -663,9 +663,9 @@ class MachineFrameInfo {
     return MaxCallFrameSize;
   }
   bool isMaxCallFrameSizeComputed() const {
-    return MaxCallFrameSize != ~0u;
+    return MaxCallFrameSize != ~UINT64_C(0);
   }
-  void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; }
+  void setMaxCallFrameSize(uint64_t S) { MaxCallFrameSize = S; }
 
   /// Returns how many bytes of callee-saved registers the target pushed in the
   /// prologue. Only used for debug info.
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 0b9cacecc7cbe1..72978b2f746d76 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -51,7 +51,7 @@ class TargetFrameLowering {
   // Maps a callee saved register to a stack slot with a fixed offset.
   struct SpillSlot {
     unsigned Reg;
-    int Offset; // Offset relative to stack pointer on function entry.
+    int64_t Offset; // Offset relative to stack pointer on function entry.
   };
 
   struct DwarfFrameBase {
@@ -66,7 +66,7 @@ class TargetFrameLowering {
       // Used with FrameBaseKind::Register.
       unsigned Reg;
       // Used with FrameBaseKind::CFA.
-      int Offset;
+      int64_t Offset;
       struct WasmFrameBase WasmLoc;
     } Location;
   };
diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index 01a64fb425a94f..689e3cd5dbf206 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -232,7 +232,7 @@ class MCAsmBackend {
   virtual void handleAssemblerFlag(MCAssemblerFlag Flag) {}
 
   /// Generate the compact unwind encoding for the CFI instructions.
-  virtual uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  virtual uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                                  const MCContext *Ctxt) const {
     return 0;
   }
diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h
index 18056c5fdf816a..150b48eedc3715 100644
--- a/llvm/include/llvm/MC/MCDwarf.h
+++ b/llvm/include/llvm/MC/MCDwarf.h
@@ -508,7 +508,7 @@ class MCCFIInstruction {
   MCSymbol *Label;
   unsigned Register;
   union {
-    int Offset;
+    int64_t Offset;
     unsigned Register2;
   };
   unsigned AddressSpace = ~0u;
@@ -516,7 +516,7 @@ class MCCFIInstruction {
   std::vector<char> Values;
   std::string Comment;
 
-  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, SMLoc Loc,
+  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int64_t O, SMLoc Loc,
                    StringRef V = "", StringRef Comment = "")
       : Operation(Op), Label(L), Register(R), Offset(O), Loc(Loc),
         Values(V.begin(), V.end()), Comment(Comment) {
@@ -528,7 +528,7 @@ class MCCFIInstruction {
     assert(Op == OpRegister);
   }
 
-  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, unsigned AS,
+  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int64_t O, unsigned AS,
                    SMLoc Loc)
       : Operation(Op), Label(L), Register(R), Offset(O), AddressSpace(AS),
         Loc(Loc) {
@@ -538,8 +538,8 @@ class MCCFIInstruction {
 public:
   /// .cfi_def_cfa defines a rule for computing CFA as: take address from
   /// Register and add Offset to it.
-  static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int Offset,
-                                    SMLoc Loc = {}) {
+  static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register,
+                                    int64_t Offset, SMLoc Loc = {}) {
     return MCCFIInstruction(OpDefCfa, L, Register, Offset, Loc);
   }
 
@@ -547,13 +547,13 @@ class MCCFIInstruction {
   /// on Register will be used instead of the old one. Offset remains the same.
   static MCCFIInstruction createDefCfaRegister(MCSymbol *L, unsigned Register,
                                                SMLoc Loc = {}) {
-    return MCCFIInstruction(OpDefCfaRegister, L, Register, 0, Loc);
+    return MCCFIInstruction(OpDefCfaRegister, L, Register, INT64_C(0), Loc);
   }
 
   /// .cfi_def_cfa_offset modifies a rule for computing CFA. Register
   /// remains the same, but offset is new. Note that it is the absolute offset
   /// that will be added to a defined register to the compute CFA address.
-  static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int Offset,
+  static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset,
                                           SMLoc Loc = {}) {
     return MCCFIInstruction(OpDefCfaOffset, L, 0, Offset, Loc);
   }
@@ -561,7 +561,7 @@ class MCCFIInstruction {
   /// .cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but
   /// Offset is a relative value that is added/subtracted from the previous
   /// offset.
-  static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int Adjustment,
+  static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int64_t Adjustment,
                                                 SMLoc Loc = {}) {
     return MCCFIInstruction(OpAdjustCfaOffset, L, 0, Adjustment, Loc);
   }
@@ -581,7 +581,7 @@ class MCCFIInstruction {
   /// .cfi_offset Previous value of Register is saved at offset Offset
   /// from CFA.
   static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register,
-                                       int Offset, SMLoc Loc = {}) {
+                                       int64_t Offset, SMLoc Loc = {}) {
     return MCCFIInstruction(OpOffset, L, Register, Offset, Loc);
   }
 
@@ -589,7 +589,7 @@ class MCCFIInstruction {
   /// Offset from the current CFA register. This is transformed to .cfi_offset
   /// using the known displacement of the CFA register from the CFA.
   static MCCFIInstruction createRelOffset(MCSymbol *L, unsigned Register,
-                                          int Offset, SMLoc Loc = {}) {
+                                          int64_t Offset, SMLoc Loc = {}) {
     return MCCFIInstruction(OpRelOffset, L, Register, Offset, Loc);
   }
 
@@ -602,12 +602,12 @@ class MCCFIInstruction {
 
   /// .cfi_window_save SPARC register window is saved.
   static MCCFIInstruction createWindowSave(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpWindowSave, L, 0, 0, Loc);
+    return MCCFIInstruction(OpWindowSave, L, 0, INT64_C(0), Loc);
   }
 
   /// .cfi_negate_ra_state AArch64 negate RA state.
   static MCCFIInstruction createNegateRAState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpNegateRAState, L, 0, 0, Loc);
+    return MCCFIInstruction(OpNegateRAState, L, 0, INT64_C(0), Loc);
   }
 
   /// .cfi_restore says that the rule for Register is now the same as it
@@ -615,31 +615,31 @@ class MCCFIInstruction {
   /// by .cfi_startproc were executed.
   static MCCFIInstruction createRestore(MCSymbol *L, unsigned Register,
                                         SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRestore, L, Register, 0, Loc);
+    return MCCFIInstruction(OpRestore, L, Register, INT64_C(0), Loc);
   }
 
   /// .cfi_undefined From now on the previous value of Register can't be
   /// restored anymore.
   static MCCFIInstruction createUndefined(MCSymbol *L, unsigned Register,
                                           SMLoc Loc = {}) {
-    return MCCFIInstruction(OpUndefined, L, Register, 0, Loc);
+    return MCCFIInstruction(OpUndefined, L, Register, INT64_C(0), Loc);
   }
 
   /// .cfi_same_value Current value of Register is the same as in the
   /// previous frame. I.e., no restoration is needed.
   static MCCFIInstruction createSameValue(MCSymbol *L, unsigned Register,
                                           SMLoc Loc = {}) {
-    return MCCFIInstruction(OpSameValue, L, Register, 0, Loc);
+    return MCCFIInstruction(OpSameValue, L, Register, INT64_C(0), Loc);
   }
 
   /// .cfi_remember_state Save all current rules for all registers.
   static MCCFIInstruction createRememberState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRememberState, L, 0, 0, Loc);
+    return MCCFIInstruction(OpRememberState, L, 0, INT64_C(0), Loc);
   }
 
   /// .cfi_restore_state Restore the previously saved state.
   static MCCFIInstruction createRestoreState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRestoreState, L, 0, 0, Loc);
+    return MCCFIInstruction(OpRestoreState, L, 0, INT64_C(0), Loc);
   }
 
   /// .cfi_escape Allows the user to add arbitrary bytes to the unwind
@@ -650,7 +650,7 @@ class MCCFIInstruction {
   }
 
   /// A special wrapper for .cfi_escape that indicates GNU_ARGS_SIZE
-  static MCCFIInstruction createGnuArgsSize(MCSymbol *L, int Size,
+  static MCCFIInstruction createGnuArgsSize(MCSymbol *L, int64_t Size,
                                             SMLoc Loc = {}) {
     return MCCFIInstruction(OpGnuArgsSize, L, 0, Size, Loc);
   }
@@ -677,7 +677,7 @@ class MCCFIInstruction {
     return AddressSpace;
   }
 
-  int getOffset() const {
+  int64_t getOffset() const {
     assert(Operation == OpDefCfa || Operation == OpOffset ||
            Operation == OpRelOffset || Operation == OpDefCfaOffset ||
            Operation == OpAdjustCfaOffset || Operation == OpGnuArgsSize ||
@@ -705,7 +705,7 @@ struct MCDwarfFrameInfo {
   unsigned CurrentCfaRegister = 0;
   unsigned PersonalityEncoding = 0;
   unsigned LsdaEncoding = 0;
-  uint32_t CompactUnwindEncoding = 0;
+  uint64_t CompactUnwindEncoding = 0;
   bool IsSignalFrame = false;
   bool IsSimple = false;
   unsigned RAReg = static_cast<unsigned>(INT_MAX);
diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp
index 87b062a16df1d2..776cc13ccd20b2 100644
--- a/llvm/lib/CodeGen/CFIInstrInserter.cpp
+++ b/llvm/lib/CodeGen/CFIInstrInserter.cpp
@@ -68,9 +68,9 @@ class CFIInstrInserter : public MachineFunctionPass {
   struct MBBCFAInfo {
     MachineBasicBlock *MBB;
     /// Value of cfa offset valid at basic block entry.
-    int IncomingCFAOffset = -1;
+    int64_t IncomingCFAOffset = -1;
     /// Value of cfa offset valid at basic block exit.
-    int OutgoingCFAOffset = -1;
+    int64_t OutgoingCFAOffset = -1;
     /// Value of cfa register valid at basic block entry.
     unsigned IncomingCFARegister = 0;
     /// Value of cfa register valid at basic block exit.
@@ -120,7 +120,7 @@ class CFIInstrInserter : public MachineFunctionPass {
   /// Return the cfa offset value that should be set at the beginning of a MBB
   /// if needed. The negated value is needed when creating CFI instructions that
   /// set absolute offset.
-  int getCorrectCFAOffset(MachineBasicBlock *MBB) {
+  int64_t getCorrectCFAOffset(MachineBasicBlock *MBB) {
     return MBBVector[MBB->getNumber()].IncomingCFAOffset;
   }
 
@@ -175,7 +175,7 @@ void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
 
 void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
   // Outgoing cfa offset set by the block.
-  int SetOffset = MBBInfo.IncomingCFAOffset;
+  int64_t SetOffset = MBBInfo.IncomingCFAOffset;
   // Outgoing cfa register set by the block.
   unsigned SetRegister = MBBInfo.IncomingCFARegister;
   MachineFunction *MF = MBBInfo.MBB->getParent();
@@ -188,7 +188,7 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
   for (MachineInstr &MI : *MBBInfo.MBB) {
     if (MI.isCFIInstruction()) {
       std::optional<unsigned> CSRReg;
-      std::optional<int> CSROffset;
+      std::optional<int64_t> CSROffset;
       unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
       const MCCFIInstruction &CFI = Instrs[CFIIndex];
       switch (CFI.getOperation()) {
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index 853de4c88caeb7..e4b993850f73dc 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -197,7 +197,7 @@ void MachineFrameInfo::computeMaxCallFrameSize(
     for (MachineInstr &MI : MBB) {
       unsigned Opcode = MI.getOpcode();
       if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) {
-        unsigned Size = TII.getFrameSize(MI);
+        uint64_t Size = TII.getFrameSize(MI);
         MaxCallFrameSize = std::max(MaxCallFrameSize, Size);
         if (FrameSDOps != nullptr)
           FrameSDOps->push_back(&MI);
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index eaf96ec5cbde8c..9771825ed875b0 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -366,8 +366,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &MF) {
     return;
 
   // (Re-)Compute the MaxCallFrameSize.
-  [[maybe_unused]] uint32_t MaxCFSIn =
-      MFI.isMaxCallFrameSizeComputed() ? MFI.getMaxCallFrameSize() : UINT32_MAX;
+  [[maybe_unused]] uint64_t MaxCFSIn =
+      MFI.isMaxCallFrameSizeComputed() ? MFI.getMaxCallFrameSize() : UINT64_MAX;
   std::vector<MachineBasicBlock::iterator> FrameSDOps;
   MFI.computeMaxCallFrameSize(MF, &FrameSDOps);
   assert(MFI.getMaxCallFrameSize() <= MaxCFSIn &&
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index 2ee0c3eb27b92e..9b8ec9bf2af0b9 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -1298,8 +1298,8 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
 namespace {
 
 class FrameEmitterImpl {
-  int CFAOffset = 0;
-  int InitialCFAOffset = 0;
+  int64_t CFAOffset = 0;
+  int64_t InitialCFAOffset = 0;
   bool IsEH;
   MCObjectStreamer &Streamer;
 
@@ -1413,7 +1413,7 @@ void FrameEmitterImpl::emitCFIInstruction(const MCCFIInstruction &Instr) {
     if (!IsEH)
       Reg = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg);
 
-    int Offset = Instr.getOffset();
+    int64_t Offset = Instr.getOffset();
     if (IsRelative)
       Offset -= CFAOffset;
     Offset = Offset / dataAlignmentFactor;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 30ef3680ae79c9..d83f7b5690eec6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -584,7 +584,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
   /// Encode compact unwind stack adjustment for frameless functions.
   /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
   /// The stack size always needs to be 16 byte aligned.
-  uint32_t encodeStackAdjustment(uint32_t StackSize) const {
+  uint64_t encodeStackAdjustment(uint64_t StackSize) const {
     return (StackSize / 16) << 12;
   }
 
@@ -602,7 +602,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
   }
 
   /// Generate the compact unwind encoding from the CFI directives.
-  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override {
     ArrayRef<MCCFIInstruction> Instrs = FI->Instructions;
     if (Instrs.empty())
@@ -612,10 +612,10 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
       return CU::UNWIND_ARM64_MODE_DWARF;
 
     bool HasFP = false;
-    unsigned StackSize = 0;
+    uint64_t StackSize = 0;
 
-    uint32_t CompactUnwindEncoding = 0;
-    int CurOffset = 0;
+    uint64_t CompactUnwindEncoding = 0;
+    int64_t CurOffset = 0;
     for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
       const MCCFIInstruction &Inst = Instrs[i];
 
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 9b54dd4e4e618d..a1012f3996e76b 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1165,7 +1165,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
         if (STI.splitFramePushPop(MF)) {
           unsigned DwarfReg = MRI->getDwarfRegNum(
               Reg == ARM::R12 ? ARM::RA_AUTH_CODE : Reg, true);
-          unsigned Offset = MFI.getObjectOffset(FI);
+          uint64_t Offset = MFI.getObjectOffset(FI);
           unsigned CFIIndex = MF.addFrameInst(
               MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
           BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1187,7 +1187,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
       if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
           (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
         unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
-        unsigned Offset = MFI.getObjectOffset(FI);
+        uint64_t Offset = MFI.getObjectOffset(FI);
         unsigned CFIIndex = MF.addFrameInst(
             MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
         BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 6cd4badb7704b7..9671f69bfd2268 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -1148,7 +1148,7 @@ enum CompactUnwindEncodings {
 /// instructions. If the CFI instructions describe a frame that cannot be
 /// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which
 /// tells the runtime to fallback and unwind using dwarf.
-uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
+uint64_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
     const MCDwarfFrameInfo *FI, const MCContext *Ctxt) const {
   DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n");
   // Only armv7k uses CFI based unwinding.
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index ac0c9b101cae13..9c958003ca756a 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -34,7 +34,7 @@ class ARMAsmBackendDarwin : public ARMAsmBackend {
         /*Is64Bit=*/false, cantFail(MachO::getCPUType(TT)), Subtype);
   }
 
-  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override;
 };
 } // end namespace llvm
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 232651132d6e4f..394456c13e6812 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1660,7 +1660,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
   using SpillSlot = TargetFrameLowering::SpillSlot;
 
   unsigned NumFixed;
-  int MinOffset = 0;  // CS offsets are negative.
+  int64_t MinOffset = 0; // CS offsets are negative.
   const SpillSlot *FixedSlots = getCalleeSavedSpillSlots(NumFixed);
   for (const SpillSlot *S = FixedSlots; S != FixedSlots+NumFixed; ++S) {
     if (!SRegs[S->Reg])
@@ -1679,7 +1679,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
     Register R = x;
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R);
     unsigned Size = TRI->getSpillSize(*RC);
-    int Off = MinOffset - Size;
+    int64_t Off = MinOffset - Size;
     Align Alignment = std::min(TRI->getSpillAlign(*RC), getStackAlign());
     Off &= -Alignment.value();
     int FI = MFI.CreateFixedSpillStackObject(Size, Off);
diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
index 176387d71fcb6c..6acbcf5cda2423 100644
--- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -294,7 +294,7 @@ void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (!hasFP(MF)) {
     MBBI = FirstCSPop;
-    int64_t Offset = -CSSize - 2;
+    int64_t Offset = -(int64_t)CSSize - 2;
     // Mark callee-saved pop instruction.
     // Define the current CFA rule to use the provided offset.
     while (MBBI != MBB.end()) {
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 99dc9797f6df92..23bff777df6e23 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -1328,7 +1328,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
 
   /// Implementation of algorithm to generate the compact unwind encoding
   /// for the CFI instructions.
-  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override {
     ArrayRef<MCCFIInstruction> Instrs = FI->Instructions;
     if (Instrs.empty()) return 0;
@@ -1343,13 +1343,13 @@ class DarwinX86AsmBackend : public X86AsmBackend {
     bool HasFP = false;
 
     // Encode that we are using EBP/RBP as the frame pointer.
-    uint32_t CompactUnwindEncoding = 0;
+    uint64_t CompactUnwindEncoding = 0;
 
     unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
     unsigned InstrOffset = 0;
     unsigned StackAdjust = 0;
-    unsigned StackSize = 0;
-    int MinAbsOffset = std::numeric_limits<int>::max();
+    uint64_t StackSize = 0;
+    int64_t MinAbsOffset = std::numeric_limits<int64_t>::max();
 
     for (const MCCFIInstruction &Inst : Instrs) {
       switch (Inst.getOperation()) {
@@ -1376,7 +1376,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
         memset(SavedRegs, 0, sizeof(SavedRegs));
         StackAdjust = 0;
         SavedRegIdx = 0;
-        MinAbsOffset = std::numeric_limits<int>::max();
+        MinAbsOffset = std::numeric_limits<int64_t>::max();
         InstrOffset += MoveInstrSize;
         break;
       }
@@ -1419,7 +1419,8 @@ class DarwinX86AsmBackend : public X86AsmBackend {
         unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
         SavedRegs[SavedRegIdx++] = Reg;
         StackAdjust += OffsetSize;
-        MinAbsOffset = std::min(MinAbsOffset, abs(Inst.getOffset()));
+        MinAbsOffset =
+            std::min<int64_t>(MinAbsOffset, std::abs(Inst.getOffset()));
         InstrOffset += PushInstrSize(Reg);
         break;
       }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 92a14226a0dc05..1df2b86349a214 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -358,7 +358,8 @@ class X86MCCodeEmitter : public MCCodeEmitter {
   void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize,
                      MCFixupKind FixupKind, uint64_t StartByte,
                      SmallVectorImpl<char> &CB,
-                     SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     int64_t ImmOffset = 0) const;
 
   void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
                         SmallVectorImpl<char> &CB) const;
@@ -412,7 +413,8 @@ static void emitConstant(uint64_t Val, unsigned Size,
 /// Determine if this immediate can fit in a disp8 or a compressed disp8 for
 /// EVEX instructions. \p will be set to the value to pass to the ImmOffset
 /// parameter of emitImmediate.
-static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) {
+static bool isDispOrCDisp8(uint64_t TSFlags, int64_t Value,
+                           int64_t &ImmOffset) {
   bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
 
   unsigned CD8_Scale =
@@ -425,7 +427,7 @@ static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) {
   if (Value & (CD8_Scale - 1)) // Unaligned offset
     return false;
 
-  int CDisp8 = Value / static_cast<int>(CD8_Scale);
+  int64_t CDisp8 = Value / static_cast<int64_t>(CD8_Scale);
   if (!isInt<8>(CDisp8))
     return false;
 
@@ -518,7 +520,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
                                      uint64_t StartByte,
                                      SmallVectorImpl<char> &CB,
                                      SmallVectorImpl<MCFixup> &Fixups,
-                                     int ImmOffset) const {
+                                     int64_t ImmOffset) const {
   const MCExpr *Expr = nullptr;
   if (DispOp.isImm()) {
     // If this is a simple integer displacement that doesn't require a
@@ -799,7 +801,7 @@ void X86MCCodeEmitter::emitMemModRMByte(
     // This also handles the 0 displacement for [EBP], [R13], [R21] or [R29]. We
     // can't use disp8 if the {disp32} pseudo prefix is present.
     if (Disp.isImm() && AllowDisp8) {
-      int ImmOffset = 0;
+      int64_t ImmOffset = 0;
       if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
         emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CB);
         emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, CB, Fixups,
@@ -826,7 +828,7 @@ void X86MCCodeEmitter::emitMemModRMByte(
 
   bool ForceDisp32 = false;
   bool ForceDisp8 = false;
-  int ImmOffset = 0;
+  int64_t ImmOffset = 0;
   if (BaseReg == 0) {
     // If there is no base register, we emit the special case SIB byte with
     // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index d914e1b61ab075..3e44ed621fdff4 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -380,9 +380,9 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
   return MI;
 }
 
-int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator &MBBI,
-                                     bool doMergeWithPrevious) const {
+int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator &MBBI,
+                                         bool doMergeWithPrevious) const {
   if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
       (!doMergeWithPrevious && MBBI == MBB.end()))
     return 0;
@@ -405,7 +405,7 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
     PI = std::prev(PI);
 
   unsigned Opc = PI->getOpcode();
-  int Offset = 0;
+  int64_t Offset = 0;
 
   if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
       PI->getOperand(0).getReg() == StackPtr) {
@@ -473,7 +473,7 @@ void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA(
                                : FramePtr;
   unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
   // Offset = space for return address + size of the frame pointer itself.
-  unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
+  int64_t Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
   BuildCFI(MBB, MBBI, DebugLoc{},
            MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));
   emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
@@ -1881,7 +1881,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
 
   // For EH funclets, only allocate enough space for outgoing calls. Save the
   // NumBytes value that we would've used for the parent frame.
-  unsigned ParentFrameNumBytes = NumBytes;
+  uint64_t ParentFrameNumBytes = NumBytes;
   if (IsFunclet)
     NumBytes = getWinEHFuncletFrameSize(MF);
 
@@ -2430,7 +2430,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   if (HasFP) {
     if (X86FI->hasSwiftAsyncContext()) {
       // Discard the context.
-      int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
+      int64_t Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
       emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
     }
     // Pop EBP.
@@ -2562,7 +2562,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (!HasFP && NeedsDwarfCFI) {
     MBBI = FirstCSPop;
-    int64_t Offset = -CSSize - SlotSize;
+    int64_t Offset = -(int64_t)CSSize - SlotSize;
     // Mark callee-saved pop instruction.
     // Define the current CFA rule to use the provided offset.
     while (MBBI != MBB.end()) {
@@ -2591,7 +2591,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
     // Add the return addr area delta back since we are not tail calling.
-    int Offset = -1 * X86FI->getTCReturnAddrDelta();
+    int64_t Offset = -1 * X86FI->getTCReturnAddrDelta();
     assert(Offset >= 0 && "TCDelta should never be positive");
     if (Offset) {
       // Check for possible merge with preceding ADD instruction.
@@ -2625,7 +2625,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
   // object.
   // We need to factor in additional offsets applied during the prologue to the
   // frame, base, and stack pointer depending on which is used.
-  int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+  int64_t Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
   const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   unsigned CSSize = X86FI->getCalleeSavedFrameSize();
   uint64_t StackSize = MFI.getStackSize();
@@ -3919,7 +3919,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
   // FIXME: Don't set FrameSetup flag in catchret case.
 
   int FI = FuncInfo.EHRegNodeFrameIndex;
-  int EHRegSize = MFI.getObjectSize(FI);
+  int64_t EHRegSize = MFI.getObjectSize(FI);
 
   if (RestoreSP) {
     // MOV32rm -EHRegSize(%ebp), %esp
@@ -3929,8 +3929,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
   }
 
   Register UsedReg;
-  int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
-  int EndOffset = -EHRegOffset - EHRegSize;
+  int64_t EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
+  int64_t EndOffset = -EHRegOffset - EHRegSize;
   FuncInfo.EHRegNodeEndOffset = EndOffset;
 
   if (UsedReg == FramePtr) {
@@ -3951,7 +3951,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
         .setMIFlag(MachineInstr::FrameSetup);
     // MOV32rm SavedEBPOffset(%esi), %ebp
     assert(X86FI->getHasSEHFramePtrSave());
-    int Offset =
+    int64_t Offset =
         getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
             .getFixed();
     assert(UsedReg == BasePtr);
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index 2dc9ecc6109d78..49580b31d39c7b 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -137,8 +137,9 @@ class X86FrameLowering : public TargetFrameLowering {
   /// it is an ADD/SUB/LEA instruction it is deleted argument and the
   /// stack adjustment is returned as a positive value for ADD/LEA and
   /// a negative for SUB.
-  int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
-                     bool doMergeWithPrevious) const;
+  int64_t mergeSPUpdates(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI,
+                         bool doMergeWithPrevious) const;
 
   /// Emit a series of instructions to increment / decrement the stack
   /// pointer by a constant value.
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index be0cf1596d0d90..57f645462089ed 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -893,7 +893,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
 
   // Determine base register and offset.
-  int FIOffset;
+  int64_t FIOffset;
   Register BasePtr;
   if (MI.isReturn()) {
     assert((!hasStackRealignment(MF) ||
@@ -946,9 +946,11 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   if (MI.getOperand(FIOperandNum+3).isImm()) {
     // Offset is a 32-bit integer.
     int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
-    int Offset = FIOffset + Imm;
-    assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
-           "Requesting 64-bit offset in 32-bit immediate!");
+    int64_t Offset = FIOffset + Imm;
+    if (!Is64Bit) {
+      assert(isInt<32>((long long)FIOffset + Imm) &&
+             "Requesting 64-bit offset in 32-bit immediate!");
+    }
     if (Offset != 0 || !tryOptimizeLEAtoMOV(II))
       MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
   } else {
diff --git a/llvm/test/CodeGen/PowerPC/huge-frame-size.ll b/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
index f1039df6f549ae..78bdac021ac8af 100644
--- a/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
+++ b/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
@@ -18,7 +18,7 @@ define void @foo(i8 %x) {
 ; CHECK-LE-NEXT:    oris 0, 0, 65535
 ; CHECK-LE-NEXT:    ori 0, 0, 65504
 ; CHECK-LE-NEXT:    stdux 1, 1, 0
-; CHECK-LE-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-LE-NEXT:    .cfi_def_cfa_offset 4294967328
 ; CHECK-LE-NEXT:    li 4, 1
 ; CHECK-LE-NEXT:    addi 5, 1, 32
 ; CHECK-LE-NEXT:    stb 3, 32(1)
diff --git a/llvm/test/CodeGen/X86/huge-stack.ll b/llvm/test/CodeGen/X86/huge-stack.ll
new file mode 100644
index 00000000000000..4596c50382a08e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/huge-stack.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 4
+; RUN: llc -O0 -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK
+%large = type [4294967295 x i8]
+
+define void @foo() unnamed_addr #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $8589934462, %rax # imm = 0x1FFFFFF7E
+; CHECK-NEXT:    subq %rax, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8589934470
+; CHECK-NEXT:    movb $42, 4294967167(%rsp)
+; CHECK-NEXT:    movb $43, -128(%rsp)
+; CHECK-NEXT:    movabsq $8589934462, %rax # imm = 0x1FFFFFF7E
+; CHECK-NEXT:    addq %rax, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %1 = alloca %large, align 1
+  %2 = alloca %large, align 1
+  %3 = getelementptr inbounds %large, ptr %1, i64 0, i64 0
+  store i8 42, ptr %3, align 1
+  %4 = getelementptr inbounds %large, ptr %2, i64 0, i64 0
+  store i8 43, ptr %4, align 1
+  ret void
+}

>From 13b653ab112736b92cd7f8ef249ced2b148ee7f4 Mon Sep 17 00:00:00 2001
From: Brandon Wu <brandon.wu at sifive.com>
Date: Wed, 27 Mar 2024 23:22:01 +0800
Subject: [PATCH 33/54] [clang][RISCV] Enable RVV with function attribute
 __attribute__((target("arch=+v"))) (#83674)

It is currently not possible to use RVV types and RVV intrinsics if
"zve32x" is not enabled globally. However, in some cases we may want
to use them only in some functions, for instance:
```
#include <riscv_vector.h>

__attribute__((target("arch=+zve32x")))
vint32m1_t rvv_add(vint32m1_t v1, vint32m1_t v2, size_t vl) {
  return __riscv_vadd(v1, v2, vl);
}

int other_add(int i1, int i2) {
  return i1 + i2;
}
```
This is supposed to compile even when the vector extension is not enabled globally, e.g.
`clang -target riscv64 -march=rv64gc -S test.c`.
---
 clang/include/clang/Sema/Sema.h               |  3 +-
 clang/lib/Sema/Sema.cpp                       |  7 +-
 clang/lib/Sema/SemaChecking.cpp               | 70 +++----------------
 clang/lib/Sema/SemaDecl.cpp                   |  9 ++-
 .../RISCV/riscv-func-attr-target-err.c        | 22 ++++++
 .../CodeGen/RISCV/riscv-func-attr-target.c    | 33 +++++++++
 .../RISCV/rvb-intrinsics/riscv32-zbb-error.c  |  4 +-
 .../RISCV/rvb-intrinsics/riscv64-zbkb-error.c | 12 ++--
 .../rvv-intrinsics-handcrafted/rvv-error.c    |  2 +-
 clang/utils/TableGen/RISCVVEmitter.cpp        |  4 --
 10 files changed, 85 insertions(+), 81 deletions(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 5ecd2f9eb2881f..3a1abd4c7892b8 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2234,7 +2234,8 @@ class Sema final {
   bool CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum);
   bool CheckRISCVBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                      CallExpr *TheCall);
-  void checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D);
+  void checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
+                           const llvm::StringMap<bool> &FeatureMap);
   bool CheckLoongArchBuiltinFunctionCall(const TargetInfo &TI,
                                          unsigned BuiltinID, CallExpr *TheCall);
   bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index b55f433a8be76f..72393bea620526 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -2065,8 +2065,11 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
         targetDiag(D->getLocation(), diag::note_defined_here, FD) << D;
     }
 
-    if (TI.hasRISCVVTypes() && Ty->isRVVSizelessBuiltinType())
-      checkRVVTypeSupport(Ty, Loc, D);
+    if (TI.hasRISCVVTypes() && Ty->isRVVSizelessBuiltinType() && FD) {
+      llvm::StringMap<bool> CallerFeatureMap;
+      Context.getFunctionFeatureMap(CallerFeatureMap, FD);
+      checkRVVTypeSupport(Ty, Loc, D, CallerFeatureMap);
+    }
 
     // Don't allow SVE types in functions without a SVE target.
     if (Ty->isSVESizelessBuiltinType() && FD && FD->hasBody()) {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 08449581330934..447e73686b4f3f 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5760,57 +5760,6 @@ static bool CheckInvalidVLENandLMUL(const TargetInfo &TI, CallExpr *TheCall,
 bool Sema::CheckRISCVBuiltinFunctionCall(const TargetInfo &TI,
                                          unsigned BuiltinID,
                                          CallExpr *TheCall) {
-  // CodeGenFunction can also detect this, but this gives a better error
-  // message.
-  bool FeatureMissing = false;
-  SmallVector<StringRef> ReqFeatures;
-  StringRef Features = Context.BuiltinInfo.getRequiredFeatures(BuiltinID);
-  Features.split(ReqFeatures, ',', -1, false);
-
-  // Check if each required feature is included
-  for (StringRef F : ReqFeatures) {
-    SmallVector<StringRef> ReqOpFeatures;
-    F.split(ReqOpFeatures, '|');
-
-    if (llvm::none_of(ReqOpFeatures,
-                      [&TI](StringRef OF) { return TI.hasFeature(OF); })) {
-      std::string FeatureStrs;
-      bool IsExtension = true;
-      for (StringRef OF : ReqOpFeatures) {
-        // If the feature is 64bit, alter the string so it will print better in
-        // the diagnostic.
-        if (OF == "64bit") {
-          assert(ReqOpFeatures.size() == 1 && "Expected '64bit' to be alone");
-          OF = "RV64";
-          IsExtension = false;
-        }
-        if (OF == "32bit") {
-          assert(ReqOpFeatures.size() == 1 && "Expected '32bit' to be alone");
-          OF = "RV32";
-          IsExtension = false;
-        }
-
-        // Convert features like "zbr" and "experimental-zbr" to "Zbr".
-        OF.consume_front("experimental-");
-        std::string FeatureStr = OF.str();
-        FeatureStr[0] = std::toupper(FeatureStr[0]);
-        // Combine strings.
-        FeatureStrs += FeatureStrs.empty() ? "" : ", ";
-        FeatureStrs += "'";
-        FeatureStrs += FeatureStr;
-        FeatureStrs += "'";
-      }
-      // Error message
-      FeatureMissing = true;
-      Diag(TheCall->getBeginLoc(), diag::err_riscv_builtin_requires_extension)
-          << IsExtension
-          << TheCall->getSourceRange() << StringRef(FeatureStrs);
-    }
-  }
-
-  if (FeatureMissing)
-    return true;
-
   // vmulh.vv, vmulh.vx, vmulhu.vv, vmulhu.vx, vmulhsu.vv, vmulhsu.vx,
   // vsmul.vv, vsmul.vx are not included for EEW=64 in Zve64*.
   switch (BuiltinID) {
@@ -6714,36 +6663,35 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
   return false;
 }
 
-void Sema::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D) {
-  const TargetInfo &TI = Context.getTargetInfo();
-
+void Sema::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
+                               const llvm::StringMap<bool> &FeatureMap) {
   ASTContext::BuiltinVectorTypeInfo Info =
       Context.getBuiltinVectorTypeInfo(Ty->castAs<BuiltinType>());
   unsigned EltSize = Context.getTypeSize(Info.ElementType);
   unsigned MinElts = Info.EC.getKnownMinValue();
 
   if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Double) &&
-      !TI.hasFeature("zve64d"))
+      !FeatureMap.lookup("zve64d"))
     Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64d";
   // (ELEN, LMUL) pairs of (8, mf8), (16, mf4), (32, mf2), (64, m1) requires at
   // least zve64x
   else if (((EltSize == 64 && Info.ElementType->isIntegerType()) ||
             MinElts == 1) &&
-           !TI.hasFeature("zve64x"))
+           !FeatureMap.lookup("zve64x"))
     Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64x";
-  else if (Info.ElementType->isFloat16Type() && !TI.hasFeature("zvfh") &&
-           !TI.hasFeature("zvfhmin"))
+  else if (Info.ElementType->isFloat16Type() && !FeatureMap.lookup("zvfh") &&
+           !FeatureMap.lookup("zvfhmin"))
     Diag(Loc, diag::err_riscv_type_requires_extension, D)
         << Ty << "zvfh or zvfhmin";
   else if (Info.ElementType->isBFloat16Type() &&
-           !TI.hasFeature("experimental-zvfbfmin"))
+           !FeatureMap.lookup("experimental-zvfbfmin"))
     Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfbfmin";
   else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Float) &&
-           !TI.hasFeature("zve32f"))
+           !FeatureMap.lookup("zve32f"))
     Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32f";
   // Given that caller already checked isRVVType() before calling this function,
   // if we don't have at least zve32x supported, then we need to emit error.
-  else if (!TI.hasFeature("zve32x"))
+  else if (!FeatureMap.lookup("zve32x"))
     Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32x";
 }
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 66aad2592cb383..8b44d24f5273aa 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -8962,8 +8962,13 @@ void Sema::CheckVariableDeclarationType(VarDecl *NewVD) {
     }
   }
 
-  if (T->isRVVSizelessBuiltinType())
-    checkRVVTypeSupport(T, NewVD->getLocation(), cast<Decl>(CurContext));
+  if (T->isRVVSizelessBuiltinType() && isa<FunctionDecl>(CurContext)) {
+    const FunctionDecl *FD = cast<FunctionDecl>(CurContext);
+    llvm::StringMap<bool> CallerFeatureMap;
+    Context.getFunctionFeatureMap(CallerFeatureMap, FD);
+    checkRVVTypeSupport(T, NewVD->getLocation(), cast<Decl>(CurContext),
+                        CallerFeatureMap);
+  }
 }
 
 /// Perform semantic checking on a newly-created variable
diff --git a/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c b/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c
index 35d6973818d01c..b303d71304bf3e 100644
--- a/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c
+++ b/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c
@@ -2,6 +2,28 @@
 // RUN: not %clang_cc1 -triple riscv64 -target-feature +zifencei -target-feature +m -target-feature +a \
 // RUN:  -emit-llvm %s 2>&1 | FileCheck %s
 
+#include <riscv_vector.h>
+
+void test_builtin() {
+// CHECK: error: '__builtin_rvv_vsetvli' needs target feature zve32x
+  __riscv_vsetvl_e8m8(1);
+}
+
+void test_rvv_i32_type() {
+// CHECK: error: RISC-V type 'vint32m1_t' (aka '__rvv_int32m1_t') requires the 'zve32x' extension
+  vint32m1_t v;
+}
+
+void test_rvv_f32_type() {
+// CHECK: error: RISC-V type 'vfloat32m1_t' (aka '__rvv_float32m1_t') requires the 'zve32f' extension
+  vfloat32m1_t v;
+}
+
+void test_rvv_f64_type() {
+// CHECK: error: RISC-V type 'vfloat64m1_t' (aka '__rvv_float64m1_t') requires the 'zve64d' extension
+  vfloat64m1_t v;
+}
+
 // CHECK: error: duplicate 'arch=' in the 'target' attribute string;
 __attribute__((target("arch=rv64gc;arch=rv64gc_zbb"))) void testMultiArchSelectLast() {}
 // CHECK: error: duplicate 'cpu=' in the 'target' attribute string;
diff --git a/clang/test/CodeGen/RISCV/riscv-func-attr-target.c b/clang/test/CodeGen/RISCV/riscv-func-attr-target.c
index f216eaf735b4a8..1f8682179ea813 100644
--- a/clang/test/CodeGen/RISCV/riscv-func-attr-target.c
+++ b/clang/test/CodeGen/RISCV/riscv-func-attr-target.c
@@ -4,6 +4,8 @@
 // RUN:  -target-feature -relax -target-feature -zfa \
 // RUN:  -emit-llvm %s -o - | FileCheck %s
 
+#include <riscv_vector.h>
+
 // CHECK-LABEL: define dso_local void @testDefault
 // CHECK-SAME: () #0 {
 void testDefault() {}
@@ -35,6 +37,34 @@ testAttrFullArchAndAttrCpu() {}
 // CHECK-SAME: () #8 {
 __attribute__((target("cpu=sifive-u54"))) void testAttrCpuOnly() {}
 
+__attribute__((target("arch=+zve32x")))
+void test_builtin_w_zve32x() {
+// CHECK-LABEL: test_builtin_w_zve32x
+// CHECK-SAME: #9
+  __riscv_vsetvl_e8m8(1);
+}
+
+__attribute__((target("arch=+zve32x")))
+void test_rvv_i32_type_w_zve32x() {
+// CHECK-LABEL: test_rvv_i32_type_w_zve32x
+// CHECK-SAME: #9
+  vint32m1_t v;
+}
+
+__attribute__((target("arch=+zve32f")))
+void test_rvv_f32_type_w_zve32f() {
+// CHECK-LABEL: test_rvv_f32_type_w_zve32f
+// CHECK-SAME: #11
+  vfloat32m1_t v;
+}
+
+__attribute__((target("arch=+zve64d")))
+void test_rvv_f64_type_w_zve64d() {
+// CHECK-LABEL: test_rvv_f64_type_w_zve64d
+// CHECK-SAME: #12
+  vfloat64m1_t v;
+}
+
 //.
 // CHECK: attributes #0 = { {{.*}}"target-features"="+64bit,+a,+m,+save-restore,+zifencei,-relax,-zbb,-zfa" }
 // CHECK: attributes #1 = { {{.*}}"target-cpu"="rocket-rv64" "target-features"="+64bit,+a,+d,+f,+m,+save-restore,+v,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-relax,-zbb,-zfa" "tune-cpu"="generic-rv64" }
@@ -46,3 +76,6 @@ __attribute__((target("cpu=sifive-u54"))) void testAttrCpuOnly() {}
 // CHECK: attributes #6 = { {{.*}}"target-cpu"="sifive-u54" "target-features"="+64bit,+a,+m,+save-restore,+zbb,+zifencei,-relax,-zfa" }
 // CHECK: attributes #7 = { {{.*}}"target-cpu"="sifive-u54" "target-features"="+64bit,+m,+save-restore,{{(-[[:alnum:]-]+)(,-[[:alnum:]-]+)*}}" }
 // CHECK: attributes #8 = { {{.*}}"target-cpu"="sifive-u54" "target-features"="+64bit,+a,+c,+d,+f,+m,+save-restore,+zicsr,+zifencei,{{(-[[:alnum:]-]+)(,-[[:alnum:]-]+)*}}" }
+// CHECK: attributes #9 = { {{.*}}"target-features"="+64bit,+a,+m,+save-restore,+zicsr,+zifencei,+zve32x,+zvl32b,-relax,-zbb,-zfa" }
+// CHECK: attributes #11 = { {{.*}}"target-features"="+64bit,+a,+f,+m,+save-restore,+zicsr,+zifencei,+zve32f,+zve32x,+zvl32b,-relax,-zbb,-zfa" }
+// CHECK: attributes #12 = { {{.*}}"target-features"="+64bit,+a,+d,+f,+m,+save-restore,+zicsr,+zifencei,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl32b,+zvl64b,-relax,-zbb,-zfa" }
diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv32-zbb-error.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv32-zbb-error.c
index ecf090a128aac7..bad68504fab055 100644
--- a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv32-zbb-error.c
+++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv32-zbb-error.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple riscv32 -target-feature +zbb -verify %s -o -
+// RUN: %clang_cc1 -triple riscv32 -target-feature +zbb -S -verify %s -o -
 
 unsigned int orc_b_64(unsigned int a) {
-  return __builtin_riscv_orc_b_64(a); // expected-error {{builtin requires: 'RV64'}}
+  return __builtin_riscv_orc_b_64(a); // expected-error {{'__builtin_riscv_orc_b_64' needs target feature zbb,64bit}}
 }
diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbkb-error.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbkb-error.c
index d2e3e76043aef1..a256bf75b031c6 100644
--- a/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbkb-error.c
+++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/riscv64-zbkb-error.c
@@ -1,14 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple riscv64 -target-feature +zbkb -verify %s -o -
+// RUN: %clang_cc1 -triple riscv64 -target-feature +zbkb -S -verify %s -o -
 
 #include <stdint.h>
 
-uint32_t zip(uint32_t rs1)
+uint32_t zip_unzip(uint32_t rs1)
 {
-  return __builtin_riscv_zip_32(rs1); // expected-error {{builtin requires: 'RV32'}}
-}
-
-uint32_t unzip(uint32_t rs1)
-{
-  return __builtin_riscv_unzip_32(rs1); // expected-error {{builtin requires: 'RV32'}}
+  (void)__builtin_riscv_zip_32(rs1); // expected-error {{'__builtin_riscv_zip_32' needs target feature zbkb,32bit}}
+  return __builtin_riscv_unzip_32(rs1); // expected-error {{'__builtin_riscv_unzip_32' needs target feature zbkb,32bit}}
 }
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-error.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-error.c
index 6ec9b057997690..ecb6c5f2702577 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-error.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-error.c
@@ -11,7 +11,7 @@
 // CHECK-RV64V-NEXT:    ret i32 [[CONV]]
 //
 
-// CHECK-RV64-ERR: error: builtin requires at least one of the following extensions: 'Zve32x'
+// CHECK-RV64-ERR: error: '__builtin_rvv_vsetvli' needs target feature zve32x
 
 int test() {
   return __builtin_rvv_vsetvli(1, 0, 0);
diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp
index 8513174c88bfc3..5e41ef9f9d2684 100644
--- a/clang/utils/TableGen/RISCVVEmitter.cpp
+++ b/clang/utils/TableGen/RISCVVEmitter.cpp
@@ -334,10 +334,6 @@ void RVVEmitter::createHeader(raw_ostream &OS) {
   OS << "#include <stdint.h>\n";
   OS << "#include <stddef.h>\n\n";
 
-  OS << "#ifndef __riscv_vector\n";
-  OS << "#error \"Vector intrinsics require the vector extension.\"\n";
-  OS << "#endif\n\n";
-
   OS << "#ifdef __cplusplus\n";
   OS << "extern \"C\" {\n";
   OS << "#endif\n\n";

>From 51f7b262425959d4e2bd6bc79fed283586d0472e Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <nickdesaulniers at users.noreply.github.com>
Date: Wed, 27 Mar 2024 08:22:51 -0700
Subject: [PATCH 34/54] [libc][support][UInt] implement 128b math helpers
 (#86531)

Flesh out the remaining UInt<128> support and add test coverage.

We could have used cpp::popcount in the implementation of
UInt::has_single_bit, but
has_single_bit has a perhaps useful early return.
---
 libc/src/__support/CPP/bit.h                 | 10 +++-
 libc/src/__support/UInt.h                    | 54 ++++++++++++++++++++
 libc/test/src/__support/CPP/bit_test.cpp     |  9 +---
 libc/test/src/__support/math_extras_test.cpp |  4 +-
 4 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h
index 3f2fbec944054c..526c499adc374c 100644
--- a/libc/src/__support/CPP/bit.h
+++ b/libc/src/__support/CPP/bit.h
@@ -242,6 +242,14 @@ LIBC_INLINE constexpr To bit_or_static_cast(const From &from) {
 /// Count number of 1's aka population count or Hamming weight.
 ///
 /// Only unsigned integral types are allowed.
+// clang-19+, gcc-14+
+#if __has_builtin(__builtin_popcountg)
+template <typename T>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, int>
+popcount(T value) {
+  return __builtin_popcountg(value);
+}
+#else // !__has_builtin(__builtin_popcountg)
 template <typename T>
 [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, int>
 popcount(T value) {
@@ -261,7 +269,7 @@ ADD_SPECIALIZATION(unsigned short, __builtin_popcount)
 ADD_SPECIALIZATION(unsigned, __builtin_popcount)
 ADD_SPECIALIZATION(unsigned long, __builtin_popcountl)
 ADD_SPECIALIZATION(unsigned long long, __builtin_popcountll)
-// TODO: 128b specializations?
+#endif // __builtin_popcountg
 #undef ADD_SPECIALIZATION
 
 } // namespace LIBC_NAMESPACE::cpp
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index df01e081e3c19e..282efdba1c5f2b 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -1082,6 +1082,17 @@ bit_cast(const UInt<Bits> &from) {
   return cpp::bit_cast<To>(from.val);
 }
 
+// Specialization of cpp::popcount ('bit.h') for BigInt.
+template <typename T>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+popcount(T value) {
+  int bits = 0;
+  for (auto word : value.val)
+    if (word)
+      bits += popcount(word);
+  return bits;
+}
+
 // Specialization of cpp::has_single_bit ('bit.h') for BigInt.
 template <typename T>
 [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, bool>
@@ -1218,6 +1229,49 @@ LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, T> mask_leading_ones() {
   return out;
 }
 
+// Specialization of count_zeros ('math_extras.h') for BigInt.
+template <typename T>
+[[nodiscard]]
+LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+count_zeros(T value) {
+  return cpp::popcount(~value);
+}
+
+// Specialization of first_leading_zero ('math_extras.h') for BigInt.
+template <typename T>
+[[nodiscard]]
+LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+first_leading_zero(T value) {
+  return value == cpp::numeric_limits<T>::max() ? 0
+                                                : cpp::countl_one(value) + 1;
+}
+
+// Specialization of first_leading_one ('math_extras.h') for BigInt.
+template <typename T>
+[[nodiscard]]
+LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+first_leading_one(T value) {
+  return first_leading_zero(~value);
+}
+
+// Specialization of first_trailing_zero ('math_extras.h') for BigInt.
+template <typename T>
+[[nodiscard]]
+LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+first_trailing_zero(T value) {
+  return value == cpp::numeric_limits<T>::max() ? 0
+                                                : cpp::countr_zero(~value) + 1;
+}
+
+// Specialization of first_trailing_one ('math_extras.h') for BigInt.
+template <typename T>
+[[nodiscard]]
+LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+first_trailing_one(T value) {
+  return value == cpp::numeric_limits<T>::max() ? 0
+                                                : cpp::countr_zero(value) + 1;
+}
+
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC___SUPPORT_UINT_H
diff --git a/libc/test/src/__support/CPP/bit_test.cpp b/libc/test/src/__support/CPP/bit_test.cpp
index cee5b90c8f4bdb..875b47e6a1980e 100644
--- a/libc/test/src/__support/CPP/bit_test.cpp
+++ b/libc/test/src/__support/CPP/bit_test.cpp
@@ -15,13 +15,6 @@
 
 namespace LIBC_NAMESPACE::cpp {
 
-using UnsignedTypesNoBigInt = testing::TypeList<
-#if defined(LIBC_TYPES_HAS_INT128)
-    __uint128_t,
-#endif // LIBC_TYPES_HAS_INT128
-    unsigned char, unsigned short, unsigned int, unsigned long,
-    unsigned long long>;
-
 using UnsignedTypes = testing::TypeList<
 #if defined(LIBC_TYPES_HAS_INT128)
     __uint128_t,
@@ -228,7 +221,7 @@ TEST(LlvmLibcBitTest, Rotr) {
             rotr<uint64_t>(0x12345678deadbeefULL, -19));
 }
 
-TYPED_TEST(LlvmLibcBitTest, CountOnes, UnsignedTypesNoBigInt) {
+TYPED_TEST(LlvmLibcBitTest, CountOnes, UnsignedTypes) {
   EXPECT_EQ(popcount(T(0)), 0);
   for (int i = 0; i != cpp::numeric_limits<T>::digits; ++i)
     EXPECT_EQ(popcount<T>(cpp::numeric_limits<T>::max() >> i),
diff --git a/libc/test/src/__support/math_extras_test.cpp b/libc/test/src/__support/math_extras_test.cpp
index e642248881a41c..e88b3e1d6b687b 100644
--- a/libc/test/src/__support/math_extras_test.cpp
+++ b/libc/test/src/__support/math_extras_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/UInt128.h" // UInt128
+#include "src/__support/UInt128.h" // UInt<128>
 #include "src/__support/integer_literals.h"
 #include "src/__support/math_extras.h"
 #include "test/UnitTest/Test.h"
@@ -19,7 +19,7 @@ using UnsignedTypesNoBigInt = testing::TypeList<
     __uint128_t,
 #endif // LIBC_TYPES_HAS_INT128
     unsigned char, unsigned short, unsigned int, unsigned long,
-    unsigned long long>;
+    unsigned long long, UInt<128>>;
 
 TEST(LlvmLibcBlockMathExtrasTest, mask_trailing_ones) {
   EXPECT_EQ(0_u8, (mask_leading_ones<uint8_t, 0>()));

>From aa2c14de1adcd265bf0c0fb44f97b5d6c1c38710 Mon Sep 17 00:00:00 2001
From: Terry Wilmarth <terry.l.wilmarth at intel.com>
Date: Wed, 27 Mar 2024 11:27:28 -0400
Subject: [PATCH 35/54] [OpenMP] Close up permissions on /tmp files (#85469)

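The SHM or /tmp files that might be created during library registration
were opened with a requested mode of 0666 (read/write for every user).
They don't need such open permissions, so this change tightens the mode
to 0600 (read/write for the owner only).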
---
 openmp/runtime/src/kmp_runtime.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index a60bdb968371e0..a242049286a74f 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -6752,11 +6752,11 @@ void __kmp_register_library_startup(void) {
       int fd1 = -1;
       shm_name = __kmp_str_format("/%s", name);
       int shm_preexist = 0;
-      fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
+      fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
       if ((fd1 == -1) && (errno == EEXIST)) {
         // file didn't open because it already exists.
         // try opening existing file
-        fd1 = shm_open(shm_name, O_RDWR, 0666);
+        fd1 = shm_open(shm_name, O_RDWR, 0600);
         if (fd1 == -1) { // file didn't open
           KMP_WARNING(FunctionError, "Can't open SHM");
           __kmp_shm_available = false;
@@ -6800,11 +6800,11 @@ void __kmp_register_library_startup(void) {
       int fd1 = -1;
       temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
       int tmp_preexist = 0;
-      fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
+      fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
       if ((fd1 == -1) && (errno == EEXIST)) {
         // file didn't open because it already exists.
         // try opening existing file
-        fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
+        fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
         if (fd1 == -1) { // file didn't open if (fd1 == -1) {
           KMP_WARNING(FunctionError, "Can't open TEMP");
           __kmp_tmp_available = false;
@@ -6944,7 +6944,7 @@ void __kmp_unregister_library(void) {
   int fd1;
   if (__kmp_shm_available) {
     shm_name = __kmp_str_format("/%s", name);
-    fd1 = shm_open(shm_name, O_RDONLY, 0666);
+    fd1 = shm_open(shm_name, O_RDONLY, 0600);
     if (fd1 != -1) { // File opened successfully
       char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
       if (data1 != MAP_FAILED) {

>From 009f88fc0e3a036be97ef7b222b90af342bae0b7 Mon Sep 17 00:00:00 2001
From: Yusra Syeda <99052248+ysyeda at users.noreply.github.com>
Date: Wed, 27 Mar 2024 11:31:21 -0400
Subject: [PATCH 36/54] [SystemZ][z/OS] TXT records in the GOFF reader (#74526)

This PR adds handling for TXT records in the GOFF reader.

---------

Authored-by: Yusra Syeda <yusra.syeda at ibm.com>
---
 llvm/include/llvm/Object/GOFF.h              |  20 +++
 llvm/include/llvm/Object/GOFFObjectFile.h    |  56 +++++--
 llvm/lib/Object/GOFFObjectFile.cpp           | 167 +++++++++++++++++++
 llvm/unittests/Object/GOFFObjectFileTest.cpp |  97 +++++++++++
 4 files changed, 326 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/Object/GOFF.h b/llvm/include/llvm/Object/GOFF.h
index 91762457ae0563..9fb8876e893d57 100644
--- a/llvm/include/llvm/Object/GOFF.h
+++ b/llvm/include/llvm/Object/GOFF.h
@@ -73,6 +73,26 @@ class Record {
   }
 };
 
+class TXTRecord : public Record {
+public:
+  /// \brief Maximum length of data; any more must go in continuation.
+  static const uint8_t TXTMaxDataLength = 56;
+
+  static Error getData(const uint8_t *Record, SmallString<256> &CompleteData);
+
+  static void getElementEsdId(const uint8_t *Record, uint32_t &EsdId) {
+    get<uint32_t>(Record, 4, EsdId);
+  }
+
+  static void getOffset(const uint8_t *Record, uint32_t &Offset) {
+    get<uint32_t>(Record, 12, Offset);
+  }
+
+  static void getDataLength(const uint8_t *Record, uint16_t &Length) {
+    get<uint16_t>(Record, 22, Length);
+  }
+};
+
 class HDRRecord : public Record {
 public:
   static Error getData(const uint8_t *Record, SmallString<256> &CompleteData);
diff --git a/llvm/include/llvm/Object/GOFFObjectFile.h b/llvm/include/llvm/Object/GOFFObjectFile.h
index 7e1ceb95f66723..6871641e97ec8d 100644
--- a/llvm/include/llvm/Object/GOFFObjectFile.h
+++ b/llvm/include/llvm/Object/GOFFObjectFile.h
@@ -29,7 +29,10 @@ namespace llvm {
 namespace object {
 
 class GOFFObjectFile : public ObjectFile {
+  friend class GOFFSymbolRef;
+
   IndexedMap<const uint8_t *> EsdPtrs; // Indexed by EsdId.
+  SmallVector<const uint8_t *, 256> TextPtrs;
 
   mutable DenseMap<uint32_t, std::pair<size_t, std::unique_ptr<char[]>>>
       EsdNamesCache;
@@ -38,7 +41,7 @@ class GOFFObjectFile : public ObjectFile {
   // (EDID, 0)               code, r/o data section
   // (EDID,PRID)             r/w data section
   SmallVector<SectionEntryImpl, 256> SectionList;
-  mutable DenseMap<uint32_t, std::string> SectionDataCache;
+  mutable DenseMap<uint32_t, SmallVector<uint8_t>> SectionDataCache;
 
 public:
   Expected<StringRef> getSymbolName(SymbolRef Symbol) const;
@@ -66,6 +69,10 @@ class GOFFObjectFile : public ObjectFile {
     return true;
   }
 
+  bool isSectionNoLoad(DataRefImpl Sec) const;
+  bool isSectionReadOnlyData(DataRefImpl Sec) const;
+  bool isSectionZeroInit(DataRefImpl Sec) const;
+
 private:
   // SymbolRef.
   Expected<StringRef> getSymbolName(DataRefImpl Symb) const override;
@@ -75,27 +82,24 @@ class GOFFObjectFile : public ObjectFile {
   Expected<uint32_t> getSymbolFlags(DataRefImpl Symb) const override;
   Expected<SymbolRef::Type> getSymbolType(DataRefImpl Symb) const override;
   Expected<section_iterator> getSymbolSection(DataRefImpl Symb) const override;
+  uint64_t getSymbolSize(DataRefImpl Symb) const;
 
   const uint8_t *getSymbolEsdRecord(DataRefImpl Symb) const;
   bool isSymbolUnresolved(DataRefImpl Symb) const;
   bool isSymbolIndirect(DataRefImpl Symb) const;
 
   // SectionRef.
-  void moveSectionNext(DataRefImpl &Sec) const override {}
-  virtual Expected<StringRef> getSectionName(DataRefImpl Sec) const override {
-    return StringRef();
-  }
-  uint64_t getSectionAddress(DataRefImpl Sec) const override { return 0; }
-  uint64_t getSectionSize(DataRefImpl Sec) const override { return 0; }
+  void moveSectionNext(DataRefImpl &Sec) const override;
+  virtual Expected<StringRef> getSectionName(DataRefImpl Sec) const override;
+  uint64_t getSectionAddress(DataRefImpl Sec) const override;
+  uint64_t getSectionSize(DataRefImpl Sec) const override;
   virtual Expected<ArrayRef<uint8_t>>
-  getSectionContents(DataRefImpl Sec) const override {
-    return ArrayRef<uint8_t>();
-  }
-  uint64_t getSectionIndex(DataRefImpl Sec) const override { return 0; }
-  uint64_t getSectionAlignment(DataRefImpl Sec) const override { return 0; }
+  getSectionContents(DataRefImpl Sec) const override;
+  uint64_t getSectionIndex(DataRefImpl Sec) const override { return Sec.d.a; }
+  uint64_t getSectionAlignment(DataRefImpl Sec) const override;
   bool isSectionCompressed(DataRefImpl Sec) const override { return false; }
-  bool isSectionText(DataRefImpl Sec) const override { return false; }
-  bool isSectionData(DataRefImpl Sec) const override { return false; }
+  bool isSectionText(DataRefImpl Sec) const override;
+  bool isSectionData(DataRefImpl Sec) const override;
   bool isSectionBSS(DataRefImpl Sec) const override { return false; }
   bool isSectionVirtual(DataRefImpl Sec) const override { return false; }
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override {
@@ -109,6 +113,7 @@ class GOFFObjectFile : public ObjectFile {
   const uint8_t *getSectionPrEsdRecord(DataRefImpl &Sec) const;
   const uint8_t *getSectionEdEsdRecord(uint32_t SectionIndex) const;
   const uint8_t *getSectionPrEsdRecord(uint32_t SectionIndex) const;
+  uint32_t getSectionDefEsdId(DataRefImpl &Sec) const;
 
   // RelocationRef.
   void moveRelocationNext(DataRefImpl &Rel) const override {}
@@ -122,6 +127,29 @@ class GOFFObjectFile : public ObjectFile {
                              SmallVectorImpl<char> &Result) const override {}
 };
 
+class GOFFSymbolRef : public SymbolRef {
+public:
+  GOFFSymbolRef(const SymbolRef &B) : SymbolRef(B) {
+    assert(isa<GOFFObjectFile>(SymbolRef::getObject()));
+  }
+
+  const GOFFObjectFile *getObject() const {
+    return cast<GOFFObjectFile>(BasicSymbolRef::getObject());
+  }
+
+  Expected<uint32_t> getSymbolGOFFFlags() const {
+    return getObject()->getSymbolFlags(getRawDataRefImpl());
+  }
+
+  Expected<SymbolRef::Type> getSymbolGOFFType() const {
+    return getObject()->getSymbolType(getRawDataRefImpl());
+  }
+
+  uint64_t getSize() const {
+    return getObject()->getSymbolSize(getRawDataRefImpl());
+  }
+};
+
 } // namespace object
 
 } // namespace llvm
diff --git a/llvm/lib/Object/GOFFObjectFile.cpp b/llvm/lib/Object/GOFFObjectFile.cpp
index 76a13559ebfe35..6b48d464dc3ec7 100644
--- a/llvm/lib/Object/GOFFObjectFile.cpp
+++ b/llvm/lib/Object/GOFFObjectFile.cpp
@@ -168,6 +168,11 @@ GOFFObjectFile::GOFFObjectFile(MemoryBufferRef Object, Error &Err)
       LLVM_DEBUG(dbgs() << "  --  ESD " << EsdId << "\n");
       break;
     }
+    case GOFF::RT_TXT:
+      // Save TXT records.
+      TextPtrs.emplace_back(I);
+      LLVM_DEBUG(dbgs() << "  --  TXT\n");
+      break;
     case GOFF::RT_END:
       LLVM_DEBUG(dbgs() << "  --  END (GOFF record type) unhandled\n");
       break;
@@ -364,6 +369,13 @@ GOFFObjectFile::getSymbolSection(DataRefImpl Symb) const {
                                std::to_string(SymEdId));
 }
 
+uint64_t GOFFObjectFile::getSymbolSize(DataRefImpl Symb) const {
+  const uint8_t *Record = getSymbolEsdRecord(Symb);
+  uint32_t Length;
+  ESDRecord::getLength(Record, Length);
+  return Length;
+}
+
 const uint8_t *GOFFObjectFile::getSectionEdEsdRecord(DataRefImpl &Sec) const {
   SectionEntryImpl EsdIds = SectionList[Sec.d.a];
   const uint8_t *EsdRecord = EsdPtrs[EsdIds.d.a];
@@ -394,6 +406,154 @@ GOFFObjectFile::getSectionPrEsdRecord(uint32_t SectionIndex) const {
   return EsdRecord;
 }
 
+uint32_t GOFFObjectFile::getSectionDefEsdId(DataRefImpl &Sec) const {
+  const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec);
+  uint32_t Length;
+  ESDRecord::getLength(EsdRecord, Length);
+  if (Length == 0) {
+    const uint8_t *PrEsdRecord = getSectionPrEsdRecord(Sec);
+    if (PrEsdRecord)
+      EsdRecord = PrEsdRecord;
+  }
+
+  uint32_t DefEsdId;
+  ESDRecord::getEsdId(EsdRecord, DefEsdId);
+  LLVM_DEBUG(dbgs() << "Got def EsdId: " << DefEsdId << '\n');
+  return DefEsdId;
+}
+
+void GOFFObjectFile::moveSectionNext(DataRefImpl &Sec) const {
+  Sec.d.a++;
+  if ((Sec.d.a) >= SectionList.size())
+    Sec.d.a = 0;
+}
+
+Expected<StringRef> GOFFObjectFile::getSectionName(DataRefImpl Sec) const {
+  DataRefImpl EdSym;
+  SectionEntryImpl EsdIds = SectionList[Sec.d.a];
+  EdSym.d.a = EsdIds.d.a;
+  Expected<StringRef> Name = getSymbolName(EdSym);
+  if (Name) {
+    StringRef Res = *Name;
+    LLVM_DEBUG(dbgs() << "Got section: " << Res << '\n');
+    LLVM_DEBUG(dbgs() << "Final section name: " << Res << '\n');
+    Name = Res;
+  }
+  return Name;
+}
+
+uint64_t GOFFObjectFile::getSectionAddress(DataRefImpl Sec) const {
+  uint32_t Offset;
+  const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec);
+  ESDRecord::getOffset(EsdRecord, Offset);
+  return Offset;
+}
+
+uint64_t GOFFObjectFile::getSectionSize(DataRefImpl Sec) const {
+  uint32_t Length;
+  uint32_t DefEsdId = getSectionDefEsdId(Sec);
+  const uint8_t *EsdRecord = EsdPtrs[DefEsdId];
+  ESDRecord::getLength(EsdRecord, Length);
+  LLVM_DEBUG(dbgs() << "Got section size: " << Length << '\n');
+  return static_cast<uint64_t>(Length);
+}
+
+// Unravel TXT records and expand fill characters to produce
+// a contiguous sequence of bytes.
+Expected<ArrayRef<uint8_t>>
+GOFFObjectFile::getSectionContents(DataRefImpl Sec) const {
+  if (SectionDataCache.count(Sec.d.a)) {
+    auto &Buf = SectionDataCache[Sec.d.a];
+    return ArrayRef<uint8_t>(Buf);
+  }
+  uint64_t SectionSize = getSectionSize(Sec);
+  uint32_t DefEsdId = getSectionDefEsdId(Sec);
+
+  const uint8_t *EdEsdRecord = getSectionEdEsdRecord(Sec);
+  bool FillBytePresent;
+  ESDRecord::getFillBytePresent(EdEsdRecord, FillBytePresent);
+  uint8_t FillByte = '\0';
+  if (FillBytePresent)
+    ESDRecord::getFillByteValue(EdEsdRecord, FillByte);
+
+  // Initialize section with fill byte.
+  SmallVector<uint8_t> Data(SectionSize, FillByte);
+
+  // Replace section with content from text records.
+  for (const uint8_t *TxtRecordInt : TextPtrs) {
+    const uint8_t *TxtRecordPtr = TxtRecordInt;
+    uint32_t TxtEsdId;
+    TXTRecord::getElementEsdId(TxtRecordPtr, TxtEsdId);
+    LLVM_DEBUG(dbgs() << "Got txt EsdId: " << TxtEsdId << '\n');
+
+    if (TxtEsdId != DefEsdId)
+      continue;
+
+    uint32_t TxtDataOffset;
+    TXTRecord::getOffset(TxtRecordPtr, TxtDataOffset);
+
+    uint16_t TxtDataSize;
+    TXTRecord::getDataLength(TxtRecordPtr, TxtDataSize);
+
+    LLVM_DEBUG(dbgs() << "Record offset " << TxtDataOffset << ", data size "
+                      << TxtDataSize << "\n");
+
+    SmallString<256> CompleteData;
+    CompleteData.reserve(TxtDataSize);
+    if (Error Err = TXTRecord::getData(TxtRecordPtr, CompleteData))
+      return std::move(Err);
+    assert(CompleteData.size() == TxtDataSize && "Wrong length of data");
+    std::copy(CompleteData.data(), CompleteData.data() + TxtDataSize,
+              Data.begin() + TxtDataOffset);
+  }
+  SectionDataCache[Sec.d.a] = Data;
+  return ArrayRef<uint8_t>(Data);
+}
+
+uint64_t GOFFObjectFile::getSectionAlignment(DataRefImpl Sec) const {
+  const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec);
+  GOFF::ESDAlignment Pow2Alignment;
+  ESDRecord::getAlignment(EsdRecord, Pow2Alignment);
+  return 1 << static_cast<uint64_t>(Pow2Alignment);
+}
+
+bool GOFFObjectFile::isSectionText(DataRefImpl Sec) const {
+  const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec);
+  GOFF::ESDExecutable Executable;
+  ESDRecord::getExecutable(EsdRecord, Executable);
+  return Executable == GOFF::ESD_EXE_CODE;
+}
+
+bool GOFFObjectFile::isSectionData(DataRefImpl Sec) const {
+  const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec);
+  GOFF::ESDExecutable Executable;
+  ESDRecord::getExecutable(EsdRecord, Executable);
+  return Executable == GOFF::ESD_EXE_DATA;
+}
+
+bool GOFFObjectFile::isSectionNoLoad(DataRefImpl Sec) const {
+  const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec);
+  GOFF::ESDLoadingBehavior LoadingBehavior;
+  ESDRecord::getLoadingBehavior(EsdRecord, LoadingBehavior);
+  return LoadingBehavior == GOFF::ESD_LB_NoLoad;
+}
+
+bool GOFFObjectFile::isSectionReadOnlyData(DataRefImpl Sec) const {
+  if (!isSectionData(Sec))
+    return false;
+
+  const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec);
+  GOFF::ESDLoadingBehavior LoadingBehavior;
+  ESDRecord::getLoadingBehavior(EsdRecord, LoadingBehavior);
+  return LoadingBehavior == GOFF::ESD_LB_Initial;
+}
+
+bool GOFFObjectFile::isSectionZeroInit(DataRefImpl Sec) const {
+  // GOFF uses fill characters and fill characters are applied
+  // on getSectionContents() - so we say false to zero init.
+  return false;
+}
+
 section_iterator GOFFObjectFile::section_begin() const {
   DataRefImpl Sec;
   moveSectionNext(Sec);
@@ -476,6 +636,13 @@ Error ESDRecord::getData(const uint8_t *Record,
   return getContinuousData(Record, DataSize, 72, CompleteData);
 }
 
+Error TXTRecord::getData(const uint8_t *Record,
+                         SmallString<256> &CompleteData) {
+  uint16_t Length;
+  getDataLength(Record, Length);
+  return getContinuousData(Record, Length, 24, CompleteData);
+}
+
 Error ENDRecord::getData(const uint8_t *Record,
                          SmallString<256> &CompleteData) {
   uint16_t Length = getNameLength(Record);
diff --git a/llvm/unittests/Object/GOFFObjectFileTest.cpp b/llvm/unittests/Object/GOFFObjectFileTest.cpp
index 734dac6b8507a7..69f60d016a8081 100644
--- a/llvm/unittests/Object/GOFFObjectFileTest.cpp
+++ b/llvm/unittests/Object/GOFFObjectFileTest.cpp
@@ -502,3 +502,100 @@ TEST(GOFFObjectFileTest, InvalidERSymbolType) {
         FailedWithMessage("ESD record 1 has unknown Executable type 0x03"));
   }
 }
+
+TEST(GOFFObjectFileTest, TXTConstruct) {
+  char GOFFData[GOFF::RecordLength * 6] = {};
+
+  // HDR record.
+  GOFFData[0] = 0x03;
+  GOFFData[1] = 0xF0;
+  GOFFData[50] = 0x01;
+
+  // ESD record.
+  GOFFData[GOFF::RecordLength] = 0x03;
+  GOFFData[GOFF::RecordLength + 7] = 0x01;  // ESDID.
+  GOFFData[GOFF::RecordLength + 71] = 0x05; // Size of symbol name.
+  GOFFData[GOFF::RecordLength + 72] = 0xa5; // Symbol name is v.
+  GOFFData[GOFF::RecordLength + 73] = 0x81; // Symbol name is a.
+  GOFFData[GOFF::RecordLength + 74] = 0x99; // Symbol name is r.
+  GOFFData[GOFF::RecordLength + 75] = 0x7b; // Symbol name is #.
+  GOFFData[GOFF::RecordLength + 76] = 0x83; // Symbol name is c.
+
+  // ESD record.
+  GOFFData[GOFF::RecordLength * 2] = 0x03;
+  GOFFData[GOFF::RecordLength * 2 + 3] = 0x01;
+  GOFFData[GOFF::RecordLength * 2 + 7] = 0x02;  // ESDID.
+  GOFFData[GOFF::RecordLength * 2 + 11] = 0x01; // Parent ESDID.
+  GOFFData[GOFF::RecordLength * 2 + 27] = 0x08; // Length.
+  GOFFData[GOFF::RecordLength * 2 + 40] = 0x01; // Name Space ID.
+  GOFFData[GOFF::RecordLength * 2 + 41] = 0x80;
+  GOFFData[GOFF::RecordLength * 2 + 60] = 0x04; // Size of symbol name.
+  GOFFData[GOFF::RecordLength * 2 + 61] = 0x04; // Size of symbol name.
+  GOFFData[GOFF::RecordLength * 2 + 63] = 0x0a; // Size of symbol name.
+  GOFFData[GOFF::RecordLength * 2 + 66] = 0x03; // Size of symbol name.
+  GOFFData[GOFF::RecordLength * 2 + 71] = 0x08; // Size of symbol name.
+  GOFFData[GOFF::RecordLength * 2 + 72] = 0xc3; // Symbol name is c.
+  GOFFData[GOFF::RecordLength * 2 + 73] = 0x6d; // Symbol name is _.
+  GOFFData[GOFF::RecordLength * 2 + 74] = 0xc3; // Symbol name is c.
+  GOFFData[GOFF::RecordLength * 2 + 75] = 0xd6; // Symbol name is o.
+  GOFFData[GOFF::RecordLength * 2 + 76] = 0xc4; // Symbol name is D.
+  GOFFData[GOFF::RecordLength * 2 + 77] = 0xc5; // Symbol name is E.
+  GOFFData[GOFF::RecordLength * 2 + 78] = 0xf6; // Symbol name is 6.
+  GOFFData[GOFF::RecordLength * 2 + 79] = 0xf4; // Symbol name is 4.
+
+  // ESD record.
+  GOFFData[GOFF::RecordLength * 3] = 0x03;
+  GOFFData[GOFF::RecordLength * 3 + 3] = 0x02;
+  GOFFData[GOFF::RecordLength * 3 + 7] = 0x03;  // ESDID.
+  GOFFData[GOFF::RecordLength * 3 + 11] = 0x02; // Parent ESDID.
+  GOFFData[GOFF::RecordLength * 3 + 71] = 0x05; // Size of symbol name.
+  GOFFData[GOFF::RecordLength * 3 + 72] = 0xa5; // Symbol name is v.
+  GOFFData[GOFF::RecordLength * 3 + 73] = 0x81; // Symbol name is a.
+  GOFFData[GOFF::RecordLength * 3 + 74] = 0x99; // Symbol name is r.
+  GOFFData[GOFF::RecordLength * 3 + 75] = 0x7b; // Symbol name is #.
+  GOFFData[GOFF::RecordLength * 3 + 76] = 0x83; // Symbol name is c.
+
+  // TXT record.
+  GOFFData[GOFF::RecordLength * 4] = 0x03;
+  GOFFData[GOFF::RecordLength * 4 + 1] = 0x10;
+  GOFFData[GOFF::RecordLength * 4 + 7] = 0x02;
+  GOFFData[GOFF::RecordLength * 4 + 23] = 0x08; // Data Length.
+  GOFFData[GOFF::RecordLength * 4 + 24] = 0x12;
+  GOFFData[GOFF::RecordLength * 4 + 25] = 0x34;
+  GOFFData[GOFF::RecordLength * 4 + 26] = 0x56;
+  GOFFData[GOFF::RecordLength * 4 + 27] = 0x78;
+  GOFFData[GOFF::RecordLength * 4 + 28] = 0x9a;
+  GOFFData[GOFF::RecordLength * 4 + 29] = 0xbc;
+  GOFFData[GOFF::RecordLength * 4 + 30] = 0xde;
+  GOFFData[GOFF::RecordLength * 4 + 31] = 0xf0;
+
+  // END record.
+  GOFFData[GOFF::RecordLength * 5] = 0x03;
+  GOFFData[GOFF::RecordLength * 5 + 1] = 0x40;
+  GOFFData[GOFF::RecordLength * 5 + 11] = 0x06;
+
+  StringRef Data(GOFFData, GOFF::RecordLength * 6);
+
+  Expected<std::unique_ptr<ObjectFile>> GOFFObjOrErr =
+      object::ObjectFile::createGOFFObjectFile(
+          MemoryBufferRef(Data, "dummyGOFF"));
+
+  ASSERT_THAT_EXPECTED(GOFFObjOrErr, Succeeded());
+
+  GOFFObjectFile *GOFFObj = dyn_cast<GOFFObjectFile>((*GOFFObjOrErr).get());
+  auto Symbols = GOFFObj->symbols();
+  ASSERT_EQ(std::distance(Symbols.begin(), Symbols.end()), 1);
+  SymbolRef Symbol = *Symbols.begin();
+  Expected<StringRef> SymbolNameOrErr = GOFFObj->getSymbolName(Symbol);
+  ASSERT_THAT_EXPECTED(SymbolNameOrErr, Succeeded());
+  StringRef SymbolName = SymbolNameOrErr.get();
+  EXPECT_EQ(SymbolName, "var#c");
+
+  auto Sections = GOFFObj->sections();
+  ASSERT_EQ(std::distance(Sections.begin(), Sections.end()), 1);
+  SectionRef Section = *Sections.begin();
+  Expected<StringRef> SectionContent = Section.getContents();
+  ASSERT_THAT_EXPECTED(SectionContent, Succeeded());
+  StringRef Contents = SectionContent.get();
+  EXPECT_EQ(Contents, "\x12\x34\x56\x78\x9a\xbc\xde\xf0");
+}

>From c388690a8b96cbdfa8c38a1e050088201da648e5 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Wed, 27 Mar 2024 16:54:50 +0100
Subject: [PATCH 37/54] [libc++][NFC] Simplify copy and move lowering to
 memmove a bit (#83574)

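We introduced `__constexpr_memmove` a while ago, which simplified the
implementation of the copy and move lowering a bit. This allows us to
remove some of the boilerplate: the `__overload` helper and
`__dispatch_copy_or_move` go away, and each pair of `*_loop` / `*_trivial`
functors is merged into a single `*_impl` functor.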
---
 libcxx/include/__algorithm/copy.h             |  6 +--
 libcxx/include/__algorithm/copy_backward.h    |  6 +--
 libcxx/include/__algorithm/copy_move_common.h | 39 ++++---------------
 libcxx/include/__algorithm/move.h             |  6 +--
 libcxx/include/__algorithm/move_backward.h    |  6 +--
 5 files changed, 15 insertions(+), 48 deletions(-)

diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h
index 4c3815405af0cf..0890b895f54092 100644
--- a/libcxx/include/__algorithm/copy.h
+++ b/libcxx/include/__algorithm/copy.h
@@ -32,7 +32,7 @@ template <class, class _InIter, class _Sent, class _OutIter>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter);
 
 template <class _AlgPolicy>
-struct __copy_loop {
+struct __copy_impl {
   template <class _InIter, class _Sent, class _OutIter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
   operator()(_InIter __first, _Sent __last, _OutIter __result) const {
@@ -94,9 +94,7 @@ struct __copy_loop {
       __local_first = _Traits::__begin(++__segment_iterator);
     }
   }
-};
 
-struct __copy_trivial {
   // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
   template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
@@ -108,7 +106,7 @@ struct __copy_trivial {
 template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
 pair<_InIter, _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14
 __copy(_InIter __first, _Sent __last, _OutIter __result) {
-  return std::__dispatch_copy_or_move<_AlgPolicy, __copy_loop<_AlgPolicy>, __copy_trivial>(
+  return std::__copy_move_unwrap_iters<__copy_impl<_AlgPolicy> >(
       std::move(__first), std::move(__last), std::move(__result));
 }
 
diff --git a/libcxx/include/__algorithm/copy_backward.h b/libcxx/include/__algorithm/copy_backward.h
index 591dd21e2b032e..73dc846a975a44 100644
--- a/libcxx/include/__algorithm/copy_backward.h
+++ b/libcxx/include/__algorithm/copy_backward.h
@@ -33,7 +33,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter>
 __copy_backward(_InIter __first, _Sent __last, _OutIter __result);
 
 template <class _AlgPolicy>
-struct __copy_backward_loop {
+struct __copy_backward_impl {
   template <class _InIter, class _Sent, class _OutIter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
   operator()(_InIter __first, _Sent __last, _OutIter __result) const {
@@ -104,9 +104,7 @@ struct __copy_backward_loop {
       __local_last = _Traits::__end(__segment_iterator);
     }
   }
-};
 
-struct __copy_backward_trivial {
   // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
   template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
@@ -118,7 +116,7 @@ struct __copy_backward_trivial {
 template <class _AlgPolicy, class _BidirectionalIterator1, class _Sentinel, class _BidirectionalIterator2>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator1, _BidirectionalIterator2>
 __copy_backward(_BidirectionalIterator1 __first, _Sentinel __last, _BidirectionalIterator2 __result) {
-  return std::__dispatch_copy_or_move<_AlgPolicy, __copy_backward_loop<_AlgPolicy>, __copy_backward_trivial>(
+  return std::__copy_move_unwrap_iters<__copy_backward_impl<_AlgPolicy> >(
       std::move(__first), std::move(__last), std::move(__result));
 }
 
diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h
index 845967b05038d6..12a26c6d6a64ee 100644
--- a/libcxx/include/__algorithm/copy_move_common.h
+++ b/libcxx/include/__algorithm/copy_move_common.h
@@ -81,30 +81,17 @@ __copy_backward_trivial_impl(_In* __first, _In* __last, _Out* __result) {
 
 // Iterator unwrapping and dispatching to the correct overload.
 
-template <class _F1, class _F2>
-struct __overload : _F1, _F2 {
-  using _F1::operator();
-  using _F2::operator();
-};
-
-template <class _InIter, class _Sent, class _OutIter, class = void>
-struct __can_rewrap : false_type {};
-
-template <class _InIter, class _Sent, class _OutIter>
-struct __can_rewrap<_InIter,
-                    _Sent,
-                    _OutIter,
-                    // Note that sentinels are always copy-constructible.
-                    __enable_if_t< is_copy_constructible<_InIter>::value && is_copy_constructible<_OutIter>::value > >
-    : true_type {};
+template <class _InIter, class _OutIter>
+struct __can_rewrap
+    : integral_constant<bool, is_copy_constructible<_InIter>::value && is_copy_constructible<_OutIter>::value> {};
 
 template <class _Algorithm,
           class _InIter,
           class _Sent,
           class _OutIter,
-          __enable_if_t<__can_rewrap<_InIter, _Sent, _OutIter>::value, int> = 0>
+          __enable_if_t<__can_rewrap<_InIter, _OutIter>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter>
-__unwrap_and_dispatch(_InIter __first, _Sent __last, _OutIter __out_first) {
+__copy_move_unwrap_iters(_InIter __first, _Sent __last, _OutIter __out_first) {
   auto __range  = std::__unwrap_range(__first, std::move(__last));
   auto __result = _Algorithm()(std::move(__range.first), std::move(__range.second), std::__unwrap_iter(__out_first));
   return std::make_pair(std::__rewrap_range<_Sent>(std::move(__first), std::move(__result.first)),
@@ -115,24 +102,12 @@ template <class _Algorithm,
           class _InIter,
           class _Sent,
           class _OutIter,
-          __enable_if_t<!__can_rewrap<_InIter, _Sent, _OutIter>::value, int> = 0>
+          __enable_if_t<!__can_rewrap<_InIter, _OutIter>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter>
-__unwrap_and_dispatch(_InIter __first, _Sent __last, _OutIter __out_first) {
+__copy_move_unwrap_iters(_InIter __first, _Sent __last, _OutIter __out_first) {
   return _Algorithm()(std::move(__first), std::move(__last), std::move(__out_first));
 }
 
-template <class _AlgPolicy,
-          class _NaiveAlgorithm,
-          class _OptimizedAlgorithm,
-          class _InIter,
-          class _Sent,
-          class _OutIter>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter>
-__dispatch_copy_or_move(_InIter __first, _Sent __last, _OutIter __out_first) {
-  using _Algorithm = __overload<_NaiveAlgorithm, _OptimizedAlgorithm>;
-  return std::__unwrap_and_dispatch<_Algorithm>(std::move(__first), std::move(__last), std::move(__out_first));
-}
-
 _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h
index bf574b5274093d..1716d43e2a613d 100644
--- a/libcxx/include/__algorithm/move.h
+++ b/libcxx/include/__algorithm/move.h
@@ -34,7 +34,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIte
 __move(_InIter __first, _Sent __last, _OutIter __result);
 
 template <class _AlgPolicy>
-struct __move_loop {
+struct __move_impl {
   template <class _InIter, class _Sent, class _OutIter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
   operator()(_InIter __first, _Sent __last, _OutIter __result) const {
@@ -95,9 +95,7 @@ struct __move_loop {
       __local_first = _Traits::__begin(++__segment_iterator);
     }
   }
-};
 
-struct __move_trivial {
   // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
   template <class _In, class _Out, __enable_if_t<__can_lower_move_assignment_to_memmove<_In, _Out>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
@@ -109,7 +107,7 @@ struct __move_trivial {
 template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
 __move(_InIter __first, _Sent __last, _OutIter __result) {
-  return std::__dispatch_copy_or_move<_AlgPolicy, __move_loop<_AlgPolicy>, __move_trivial>(
+  return std::__copy_move_unwrap_iters<__move_impl<_AlgPolicy> >(
       std::move(__first), std::move(__last), std::move(__result));
 }
 
diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h
index 6bb7c91d66c7ce..4beb7bdbaac0d0 100644
--- a/libcxx/include/__algorithm/move_backward.h
+++ b/libcxx/include/__algorithm/move_backward.h
@@ -33,7 +33,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator1
 __move_backward(_BidirectionalIterator1 __first, _Sentinel __last, _BidirectionalIterator2 __result);
 
 template <class _AlgPolicy>
-struct __move_backward_loop {
+struct __move_backward_impl {
   template <class _InIter, class _Sent, class _OutIter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
   operator()(_InIter __first, _Sent __last, _OutIter __result) const {
@@ -104,9 +104,7 @@ struct __move_backward_loop {
       __local_last = _Traits::__end(--__segment_iterator);
     }
   }
-};
 
-struct __move_backward_trivial {
   // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
   template <class _In, class _Out, __enable_if_t<__can_lower_move_assignment_to_memmove<_In, _Out>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
@@ -122,7 +120,7 @@ __move_backward(_BidirectionalIterator1 __first, _Sentinel __last, _Bidirectiona
                     std::is_copy_constructible<_BidirectionalIterator1>::value,
                 "Iterators must be copy constructible.");
 
-  return std::__dispatch_copy_or_move<_AlgPolicy, __move_backward_loop<_AlgPolicy>, __move_backward_trivial>(
+  return std::__copy_move_unwrap_iters<__move_backward_impl<_AlgPolicy> >(
       std::move(__first), std::move(__last), std::move(__result));
 }
 

>From 313bf28f98f714a0bd8f74a3beb4631d94428f89 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 27 Mar 2024 16:04:48 +0000
Subject: [PATCH 38/54] [ARM][MVE] Remove kill flags when reusing VPR register.
 (#86300)

When an earlier VPR value is reused for a later instruction, the use that was
previously marked as killed is no longer its last use, so clear the kill flags
on the reused register.
---
 .../ARM/MVETPAndVPTOptimisationsPass.cpp      |  1 +
 .../CodeGen/Thumb2/mve-vpt-optimisations.mir  | 25 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index 5c113ccfdc1579..e8d2cba7ee556f 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -958,6 +958,7 @@ bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
 
     unsigned NotImm = ~Imm & 0xffff;
     if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
+      MRI->clearKillFlags(LastVPTReg);
       Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
       if (MRI->use_empty(VPR)) {
         DeadInstructions.insert(Copy);
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
index f28311e6563f41..f9b175ed80fbf3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode -run-pass arm-mve-vpt-opts %s -o - | FileCheck %s
+# RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+armv8.1-m.main,+hwdiv,+mve.fp,+ras,+thumb-mode -run-pass arm-mve-vpt-opts -verify-machineinstrs %s -o - | FileCheck %s
 
 ---
 name:            vcmp_with_opposite_cond
@@ -1021,3 +1021,26 @@ body:             |
     %16:mqpr = MVE_VORR %15, %15, 1, %10, $noreg, undef %16
     %17:mqpr = MVE_VORR %16, %16, 1, %11, $noreg, undef %17
 ...
+---
+name:            reuse_kill_flags
+alignment:       4
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: reuse_kill_flags
+    ; CHECK: [[t2MOVi:%[0-9]+]]:tgpreven = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vccr = COPY [[t2MOVi]]
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:mqpr = IMPLICIT_DEF
+    ; CHECK-NEXT: [[MVE_VORR:%[0-9]+]]:mqpr = MVE_VORR [[DEF]], [[DEF]], 1, [[COPY]], $noreg, undef [[MVE_VORR]]
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:mqpr = IMPLICIT_DEF
+    ; CHECK-NEXT: [[MVE_VORR1:%[0-9]+]]:mqpr = MVE_VORR [[DEF1]], [[DEF1]], 1, killed [[COPY]], $noreg, undef [[MVE_VORR1]]
+    ; CHECK-NEXT: tBX_RET 14 /* CC::al */, $noreg, implicit [[DEF1]]
+    %0:tgpreven = t2MOVi 0, 14, $noreg, $noreg
+    %1:vccr = COPY %0:tgpreven
+    %2:mqpr = IMPLICIT_DEF
+    %3:mqpr = MVE_VORR %2:mqpr, %2:mqpr, 1, killed %1, $noreg, undef %3
+    %4:vccr = COPY %0:tgpreven
+    %5:mqpr = IMPLICIT_DEF
+    %6:mqpr = MVE_VORR %5:mqpr, %5:mqpr, 1, killed %4, $noreg, undef %6
+    tBX_RET 14 /* CC::al */, $noreg, implicit %5:mqpr
+
+...

>From 78f0871beed002187e65cc1334087596e9c11043 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 16:16:15 +0000
Subject: [PATCH 39/54] Revert rG58de1e2c5eee548a9b365e3b1554d87317072ad9 "Fix
 stack layout for frames larger than 2gb (#84114)"

This is failing on some EXPENSIVE_CHECKS buildbots
---
 llvm/include/llvm/CodeGen/MachineFrameInfo.h  | 14 +++----
 .../llvm/CodeGen/TargetFrameLowering.h        |  4 +-
 llvm/include/llvm/MC/MCAsmBackend.h           |  2 +-
 llvm/include/llvm/MC/MCDwarf.h                | 40 +++++++++----------
 llvm/lib/CodeGen/CFIInstrInserter.cpp         | 10 ++---
 llvm/lib/CodeGen/MachineFrameInfo.cpp         |  2 +-
 llvm/lib/CodeGen/PrologEpilogInserter.cpp     |  4 +-
 llvm/lib/MC/MCDwarf.cpp                       |  6 +--
 .../MCTargetDesc/AArch64AsmBackend.cpp        | 10 ++---
 llvm/lib/Target/ARM/ARMFrameLowering.cpp      |  4 +-
 .../Target/ARM/MCTargetDesc/ARMAsmBackend.cpp |  2 +-
 .../ARM/MCTargetDesc/ARMAsmBackendDarwin.h    |  2 +-
 .../Target/Hexagon/HexagonFrameLowering.cpp   |  4 +-
 .../lib/Target/MSP430/MSP430FrameLowering.cpp |  2 +-
 .../Target/X86/MCTargetDesc/X86AsmBackend.cpp | 13 +++---
 .../X86/MCTargetDesc/X86MCCodeEmitter.cpp     | 14 +++----
 llvm/lib/Target/X86/X86FrameLowering.cpp      | 28 ++++++-------
 llvm/lib/Target/X86/X86FrameLowering.h        |  5 +--
 llvm/lib/Target/X86/X86RegisterInfo.cpp       | 10 ++---
 llvm/test/CodeGen/PowerPC/huge-frame-size.ll  |  2 +-
 llvm/test/CodeGen/X86/huge-stack.ll           | 24 -----------
 21 files changed, 86 insertions(+), 116 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/huge-stack.ll

diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index ad6142b46515bf..0fe73fec7ee67f 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -251,7 +251,7 @@ class MachineFrameInfo {
   /// targets, this value is only used when generating debug info (via
   /// TargetRegisterInfo::getFrameIndexReference); when generating code, the
   /// corresponding adjustments are performed directly.
-  int64_t OffsetAdjustment = 0;
+  int OffsetAdjustment = 0;
 
   /// The prolog/epilog code inserter may process objects that require greater
   /// alignment than the default alignment the target provides.
@@ -280,7 +280,7 @@ class MachineFrameInfo {
   /// setup/destroy pseudo instructions (as defined in the TargetFrameInfo
   /// class).  This information is important for frame pointer elimination.
   /// It is only valid during and after prolog/epilog code insertion.
-  uint64_t MaxCallFrameSize = ~UINT64_C(0);
+  unsigned MaxCallFrameSize = ~0u;
 
   /// The number of bytes of callee saved registers that the target wants to
   /// report for the current function in the CodeView S_FRAMEPROC record.
@@ -591,10 +591,10 @@ class MachineFrameInfo {
   uint64_t estimateStackSize(const MachineFunction &MF) const;
 
   /// Return the correction for frame offsets.
-  int64_t getOffsetAdjustment() const { return OffsetAdjustment; }
+  int getOffsetAdjustment() const { return OffsetAdjustment; }
 
   /// Set the correction for frame offsets.
-  void setOffsetAdjustment(int64_t Adj) { OffsetAdjustment = Adj; }
+  void setOffsetAdjustment(int Adj) { OffsetAdjustment = Adj; }
 
   /// Return the alignment in bytes that this function must be aligned to,
   /// which is greater than the default stack alignment provided by the target.
@@ -655,7 +655,7 @@ class MachineFrameInfo {
   /// CallFrameSetup/Destroy pseudo instructions are used by the target, and
   /// then only during or after prolog/epilog code insertion.
   ///
-  uint64_t getMaxCallFrameSize() const {
+  unsigned getMaxCallFrameSize() const {
     // TODO: Enable this assert when targets are fixed.
     //assert(isMaxCallFrameSizeComputed() && "MaxCallFrameSize not computed yet");
     if (!isMaxCallFrameSizeComputed())
@@ -663,9 +663,9 @@ class MachineFrameInfo {
     return MaxCallFrameSize;
   }
   bool isMaxCallFrameSizeComputed() const {
-    return MaxCallFrameSize != ~UINT64_C(0);
+    return MaxCallFrameSize != ~0u;
   }
-  void setMaxCallFrameSize(uint64_t S) { MaxCallFrameSize = S; }
+  void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; }
 
   /// Returns how many bytes of callee-saved registers the target pushed in the
   /// prologue. Only used for debug info.
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 72978b2f746d76..0b9cacecc7cbe1 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -51,7 +51,7 @@ class TargetFrameLowering {
   // Maps a callee saved register to a stack slot with a fixed offset.
   struct SpillSlot {
     unsigned Reg;
-    int64_t Offset; // Offset relative to stack pointer on function entry.
+    int Offset; // Offset relative to stack pointer on function entry.
   };
 
   struct DwarfFrameBase {
@@ -66,7 +66,7 @@ class TargetFrameLowering {
       // Used with FrameBaseKind::Register.
       unsigned Reg;
       // Used with FrameBaseKind::CFA.
-      int64_t Offset;
+      int Offset;
       struct WasmFrameBase WasmLoc;
     } Location;
   };
diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index 689e3cd5dbf206..01a64fb425a94f 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -232,7 +232,7 @@ class MCAsmBackend {
   virtual void handleAssemblerFlag(MCAssemblerFlag Flag) {}
 
   /// Generate the compact unwind encoding for the CFI instructions.
-  virtual uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  virtual uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                                  const MCContext *Ctxt) const {
     return 0;
   }
diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h
index 150b48eedc3715..18056c5fdf816a 100644
--- a/llvm/include/llvm/MC/MCDwarf.h
+++ b/llvm/include/llvm/MC/MCDwarf.h
@@ -508,7 +508,7 @@ class MCCFIInstruction {
   MCSymbol *Label;
   unsigned Register;
   union {
-    int64_t Offset;
+    int Offset;
     unsigned Register2;
   };
   unsigned AddressSpace = ~0u;
@@ -516,7 +516,7 @@ class MCCFIInstruction {
   std::vector<char> Values;
   std::string Comment;
 
-  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int64_t O, SMLoc Loc,
+  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, SMLoc Loc,
                    StringRef V = "", StringRef Comment = "")
       : Operation(Op), Label(L), Register(R), Offset(O), Loc(Loc),
         Values(V.begin(), V.end()), Comment(Comment) {
@@ -528,7 +528,7 @@ class MCCFIInstruction {
     assert(Op == OpRegister);
   }
 
-  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int64_t O, unsigned AS,
+  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, unsigned AS,
                    SMLoc Loc)
       : Operation(Op), Label(L), Register(R), Offset(O), AddressSpace(AS),
         Loc(Loc) {
@@ -538,8 +538,8 @@ class MCCFIInstruction {
 public:
   /// .cfi_def_cfa defines a rule for computing CFA as: take address from
   /// Register and add Offset to it.
-  static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register,
-                                    int64_t Offset, SMLoc Loc = {}) {
+  static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int Offset,
+                                    SMLoc Loc = {}) {
     return MCCFIInstruction(OpDefCfa, L, Register, Offset, Loc);
   }
 
@@ -547,13 +547,13 @@ class MCCFIInstruction {
   /// on Register will be used instead of the old one. Offset remains the same.
   static MCCFIInstruction createDefCfaRegister(MCSymbol *L, unsigned Register,
                                                SMLoc Loc = {}) {
-    return MCCFIInstruction(OpDefCfaRegister, L, Register, INT64_C(0), Loc);
+    return MCCFIInstruction(OpDefCfaRegister, L, Register, 0, Loc);
   }
 
   /// .cfi_def_cfa_offset modifies a rule for computing CFA. Register
   /// remains the same, but offset is new. Note that it is the absolute offset
   /// that will be added to a defined register to the compute CFA address.
-  static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset,
+  static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int Offset,
                                           SMLoc Loc = {}) {
     return MCCFIInstruction(OpDefCfaOffset, L, 0, Offset, Loc);
   }
@@ -561,7 +561,7 @@ class MCCFIInstruction {
   /// .cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but
   /// Offset is a relative value that is added/subtracted from the previous
   /// offset.
-  static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int64_t Adjustment,
+  static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int Adjustment,
                                                 SMLoc Loc = {}) {
     return MCCFIInstruction(OpAdjustCfaOffset, L, 0, Adjustment, Loc);
   }
@@ -581,7 +581,7 @@ class MCCFIInstruction {
   /// .cfi_offset Previous value of Register is saved at offset Offset
   /// from CFA.
   static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register,
-                                       int64_t Offset, SMLoc Loc = {}) {
+                                       int Offset, SMLoc Loc = {}) {
     return MCCFIInstruction(OpOffset, L, Register, Offset, Loc);
   }
 
@@ -589,7 +589,7 @@ class MCCFIInstruction {
   /// Offset from the current CFA register. This is transformed to .cfi_offset
   /// using the known displacement of the CFA register from the CFA.
   static MCCFIInstruction createRelOffset(MCSymbol *L, unsigned Register,
-                                          int64_t Offset, SMLoc Loc = {}) {
+                                          int Offset, SMLoc Loc = {}) {
     return MCCFIInstruction(OpRelOffset, L, Register, Offset, Loc);
   }
 
@@ -602,12 +602,12 @@ class MCCFIInstruction {
 
   /// .cfi_window_save SPARC register window is saved.
   static MCCFIInstruction createWindowSave(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpWindowSave, L, 0, INT64_C(0), Loc);
+    return MCCFIInstruction(OpWindowSave, L, 0, 0, Loc);
   }
 
   /// .cfi_negate_ra_state AArch64 negate RA state.
   static MCCFIInstruction createNegateRAState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpNegateRAState, L, 0, INT64_C(0), Loc);
+    return MCCFIInstruction(OpNegateRAState, L, 0, 0, Loc);
   }
 
   /// .cfi_restore says that the rule for Register is now the same as it
@@ -615,31 +615,31 @@ class MCCFIInstruction {
   /// by .cfi_startproc were executed.
   static MCCFIInstruction createRestore(MCSymbol *L, unsigned Register,
                                         SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRestore, L, Register, INT64_C(0), Loc);
+    return MCCFIInstruction(OpRestore, L, Register, 0, Loc);
   }
 
   /// .cfi_undefined From now on the previous value of Register can't be
   /// restored anymore.
   static MCCFIInstruction createUndefined(MCSymbol *L, unsigned Register,
                                           SMLoc Loc = {}) {
-    return MCCFIInstruction(OpUndefined, L, Register, INT64_C(0), Loc);
+    return MCCFIInstruction(OpUndefined, L, Register, 0, Loc);
   }
 
   /// .cfi_same_value Current value of Register is the same as in the
   /// previous frame. I.e., no restoration is needed.
   static MCCFIInstruction createSameValue(MCSymbol *L, unsigned Register,
                                           SMLoc Loc = {}) {
-    return MCCFIInstruction(OpSameValue, L, Register, INT64_C(0), Loc);
+    return MCCFIInstruction(OpSameValue, L, Register, 0, Loc);
   }
 
   /// .cfi_remember_state Save all current rules for all registers.
   static MCCFIInstruction createRememberState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRememberState, L, 0, INT64_C(0), Loc);
+    return MCCFIInstruction(OpRememberState, L, 0, 0, Loc);
   }
 
   /// .cfi_restore_state Restore the previously saved state.
   static MCCFIInstruction createRestoreState(MCSymbol *L, SMLoc Loc = {}) {
-    return MCCFIInstruction(OpRestoreState, L, 0, INT64_C(0), Loc);
+    return MCCFIInstruction(OpRestoreState, L, 0, 0, Loc);
   }
 
   /// .cfi_escape Allows the user to add arbitrary bytes to the unwind
@@ -650,7 +650,7 @@ class MCCFIInstruction {
   }
 
   /// A special wrapper for .cfi_escape that indicates GNU_ARGS_SIZE
-  static MCCFIInstruction createGnuArgsSize(MCSymbol *L, int64_t Size,
+  static MCCFIInstruction createGnuArgsSize(MCSymbol *L, int Size,
                                             SMLoc Loc = {}) {
     return MCCFIInstruction(OpGnuArgsSize, L, 0, Size, Loc);
   }
@@ -677,7 +677,7 @@ class MCCFIInstruction {
     return AddressSpace;
   }
 
-  int64_t getOffset() const {
+  int getOffset() const {
     assert(Operation == OpDefCfa || Operation == OpOffset ||
            Operation == OpRelOffset || Operation == OpDefCfaOffset ||
            Operation == OpAdjustCfaOffset || Operation == OpGnuArgsSize ||
@@ -705,7 +705,7 @@ struct MCDwarfFrameInfo {
   unsigned CurrentCfaRegister = 0;
   unsigned PersonalityEncoding = 0;
   unsigned LsdaEncoding = 0;
-  uint64_t CompactUnwindEncoding = 0;
+  uint32_t CompactUnwindEncoding = 0;
   bool IsSignalFrame = false;
   bool IsSimple = false;
   unsigned RAReg = static_cast<unsigned>(INT_MAX);
diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp
index 776cc13ccd20b2..87b062a16df1d2 100644
--- a/llvm/lib/CodeGen/CFIInstrInserter.cpp
+++ b/llvm/lib/CodeGen/CFIInstrInserter.cpp
@@ -68,9 +68,9 @@ class CFIInstrInserter : public MachineFunctionPass {
   struct MBBCFAInfo {
     MachineBasicBlock *MBB;
     /// Value of cfa offset valid at basic block entry.
-    int64_t IncomingCFAOffset = -1;
+    int IncomingCFAOffset = -1;
     /// Value of cfa offset valid at basic block exit.
-    int64_t OutgoingCFAOffset = -1;
+    int OutgoingCFAOffset = -1;
     /// Value of cfa register valid at basic block entry.
     unsigned IncomingCFARegister = 0;
     /// Value of cfa register valid at basic block exit.
@@ -120,7 +120,7 @@ class CFIInstrInserter : public MachineFunctionPass {
   /// Return the cfa offset value that should be set at the beginning of a MBB
   /// if needed. The negated value is needed when creating CFI instructions that
   /// set absolute offset.
-  int64_t getCorrectCFAOffset(MachineBasicBlock *MBB) {
+  int getCorrectCFAOffset(MachineBasicBlock *MBB) {
     return MBBVector[MBB->getNumber()].IncomingCFAOffset;
   }
 
@@ -175,7 +175,7 @@ void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
 
 void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
   // Outgoing cfa offset set by the block.
-  int64_t SetOffset = MBBInfo.IncomingCFAOffset;
+  int SetOffset = MBBInfo.IncomingCFAOffset;
   // Outgoing cfa register set by the block.
   unsigned SetRegister = MBBInfo.IncomingCFARegister;
   MachineFunction *MF = MBBInfo.MBB->getParent();
@@ -188,7 +188,7 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
   for (MachineInstr &MI : *MBBInfo.MBB) {
     if (MI.isCFIInstruction()) {
       std::optional<unsigned> CSRReg;
-      std::optional<int64_t> CSROffset;
+      std::optional<int> CSROffset;
       unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
       const MCCFIInstruction &CFI = Instrs[CFIIndex];
       switch (CFI.getOperation()) {
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index e4b993850f73dc..853de4c88caeb7 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -197,7 +197,7 @@ void MachineFrameInfo::computeMaxCallFrameSize(
     for (MachineInstr &MI : MBB) {
       unsigned Opcode = MI.getOpcode();
       if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) {
-        uint64_t Size = TII.getFrameSize(MI);
+        unsigned Size = TII.getFrameSize(MI);
         MaxCallFrameSize = std::max(MaxCallFrameSize, Size);
         if (FrameSDOps != nullptr)
           FrameSDOps->push_back(&MI);
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 9771825ed875b0..eaf96ec5cbde8c 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -366,8 +366,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &MF) {
     return;
 
   // (Re-)Compute the MaxCallFrameSize.
-  [[maybe_unused]] uint64_t MaxCFSIn =
-      MFI.isMaxCallFrameSizeComputed() ? MFI.getMaxCallFrameSize() : UINT64_MAX;
+  [[maybe_unused]] uint32_t MaxCFSIn =
+      MFI.isMaxCallFrameSizeComputed() ? MFI.getMaxCallFrameSize() : UINT32_MAX;
   std::vector<MachineBasicBlock::iterator> FrameSDOps;
   MFI.computeMaxCallFrameSize(MF, &FrameSDOps);
   assert(MFI.getMaxCallFrameSize() <= MaxCFSIn &&
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index 9b8ec9bf2af0b9..2ee0c3eb27b92e 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -1298,8 +1298,8 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
 namespace {
 
 class FrameEmitterImpl {
-  int64_t CFAOffset = 0;
-  int64_t InitialCFAOffset = 0;
+  int CFAOffset = 0;
+  int InitialCFAOffset = 0;
   bool IsEH;
   MCObjectStreamer &Streamer;
 
@@ -1413,7 +1413,7 @@ void FrameEmitterImpl::emitCFIInstruction(const MCCFIInstruction &Instr) {
     if (!IsEH)
       Reg = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg);
 
-    int64_t Offset = Instr.getOffset();
+    int Offset = Instr.getOffset();
     if (IsRelative)
       Offset -= CFAOffset;
     Offset = Offset / dataAlignmentFactor;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index d83f7b5690eec6..30ef3680ae79c9 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -584,7 +584,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
   /// Encode compact unwind stack adjustment for frameless functions.
   /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
   /// The stack size always needs to be 16 byte aligned.
-  uint64_t encodeStackAdjustment(uint64_t StackSize) const {
+  uint32_t encodeStackAdjustment(uint32_t StackSize) const {
     return (StackSize / 16) << 12;
   }
 
@@ -602,7 +602,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
   }
 
   /// Generate the compact unwind encoding from the CFI directives.
-  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override {
     ArrayRef<MCCFIInstruction> Instrs = FI->Instructions;
     if (Instrs.empty())
@@ -612,10 +612,10 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
       return CU::UNWIND_ARM64_MODE_DWARF;
 
     bool HasFP = false;
-    uint64_t StackSize = 0;
+    unsigned StackSize = 0;
 
-    uint64_t CompactUnwindEncoding = 0;
-    int64_t CurOffset = 0;
+    uint32_t CompactUnwindEncoding = 0;
+    int CurOffset = 0;
     for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
       const MCCFIInstruction &Inst = Instrs[i];
 
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index a1012f3996e76b..9b54dd4e4e618d 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1165,7 +1165,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
         if (STI.splitFramePushPop(MF)) {
           unsigned DwarfReg = MRI->getDwarfRegNum(
               Reg == ARM::R12 ? ARM::RA_AUTH_CODE : Reg, true);
-          uint64_t Offset = MFI.getObjectOffset(FI);
+          unsigned Offset = MFI.getObjectOffset(FI);
           unsigned CFIIndex = MF.addFrameInst(
               MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
           BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1187,7 +1187,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
       if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
           (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
         unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
-        uint64_t Offset = MFI.getObjectOffset(FI);
+        unsigned Offset = MFI.getObjectOffset(FI);
         unsigned CFIIndex = MF.addFrameInst(
             MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
         BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 9671f69bfd2268..6cd4badb7704b7 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -1148,7 +1148,7 @@ enum CompactUnwindEncodings {
 /// instructions. If the CFI instructions describe a frame that cannot be
 /// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which
 /// tells the runtime to fallback and unwind using dwarf.
-uint64_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
+uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
     const MCDwarfFrameInfo *FI, const MCContext *Ctxt) const {
   DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n");
   // Only armv7k uses CFI based unwinding.
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index 9c958003ca756a..ac0c9b101cae13 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -34,7 +34,7 @@ class ARMAsmBackendDarwin : public ARMAsmBackend {
         /*Is64Bit=*/false, cantFail(MachO::getCPUType(TT)), Subtype);
   }
 
-  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override;
 };
 } // end namespace llvm
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 394456c13e6812..232651132d6e4f 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1660,7 +1660,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
   using SpillSlot = TargetFrameLowering::SpillSlot;
 
   unsigned NumFixed;
-  int64_t MinOffset = 0; // CS offsets are negative.
+  int MinOffset = 0;  // CS offsets are negative.
   const SpillSlot *FixedSlots = getCalleeSavedSpillSlots(NumFixed);
   for (const SpillSlot *S = FixedSlots; S != FixedSlots+NumFixed; ++S) {
     if (!SRegs[S->Reg])
@@ -1679,7 +1679,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
     Register R = x;
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R);
     unsigned Size = TRI->getSpillSize(*RC);
-    int64_t Off = MinOffset - Size;
+    int Off = MinOffset - Size;
     Align Alignment = std::min(TRI->getSpillAlign(*RC), getStackAlign());
     Off &= -Alignment.value();
     int FI = MFI.CreateFixedSpillStackObject(Size, Off);
diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
index 6acbcf5cda2423..176387d71fcb6c 100644
--- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -294,7 +294,7 @@ void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (!hasFP(MF)) {
     MBBI = FirstCSPop;
-    int64_t Offset = -(int64_t)CSSize - 2;
+    int64_t Offset = -CSSize - 2;
     // Mark callee-saved pop instruction.
     // Define the current CFA rule to use the provided offset.
     while (MBBI != MBB.end()) {
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 23bff777df6e23..99dc9797f6df92 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -1328,7 +1328,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
 
   /// Implementation of algorithm to generate the compact unwind encoding
   /// for the CFI instructions.
-  uint64_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
+  uint32_t generateCompactUnwindEncoding(const MCDwarfFrameInfo *FI,
                                          const MCContext *Ctxt) const override {
     ArrayRef<MCCFIInstruction> Instrs = FI->Instructions;
     if (Instrs.empty()) return 0;
@@ -1343,13 +1343,13 @@ class DarwinX86AsmBackend : public X86AsmBackend {
     bool HasFP = false;
 
     // Encode that we are using EBP/RBP as the frame pointer.
-    uint64_t CompactUnwindEncoding = 0;
+    uint32_t CompactUnwindEncoding = 0;
 
     unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
     unsigned InstrOffset = 0;
     unsigned StackAdjust = 0;
-    uint64_t StackSize = 0;
-    int64_t MinAbsOffset = std::numeric_limits<int64_t>::max();
+    unsigned StackSize = 0;
+    int MinAbsOffset = std::numeric_limits<int>::max();
 
     for (const MCCFIInstruction &Inst : Instrs) {
       switch (Inst.getOperation()) {
@@ -1376,7 +1376,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
         memset(SavedRegs, 0, sizeof(SavedRegs));
         StackAdjust = 0;
         SavedRegIdx = 0;
-        MinAbsOffset = std::numeric_limits<int64_t>::max();
+        MinAbsOffset = std::numeric_limits<int>::max();
         InstrOffset += MoveInstrSize;
         break;
       }
@@ -1419,8 +1419,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
         unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
         SavedRegs[SavedRegIdx++] = Reg;
         StackAdjust += OffsetSize;
-        MinAbsOffset =
-            std::min<int64_t>(MinAbsOffset, std::abs(Inst.getOffset()));
+        MinAbsOffset = std::min(MinAbsOffset, abs(Inst.getOffset()));
         InstrOffset += PushInstrSize(Reg);
         break;
       }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 1df2b86349a214..92a14226a0dc05 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -358,8 +358,7 @@ class X86MCCodeEmitter : public MCCodeEmitter {
   void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize,
                      MCFixupKind FixupKind, uint64_t StartByte,
                      SmallVectorImpl<char> &CB,
-                     SmallVectorImpl<MCFixup> &Fixups,
-                     int64_t ImmOffset = 0) const;
+                     SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
 
   void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
                         SmallVectorImpl<char> &CB) const;
@@ -413,8 +412,7 @@ static void emitConstant(uint64_t Val, unsigned Size,
 /// Determine if this immediate can fit in a disp8 or a compressed disp8 for
 /// EVEX instructions. \p will be set to the value to pass to the ImmOffset
 /// parameter of emitImmediate.
-static bool isDispOrCDisp8(uint64_t TSFlags, int64_t Value,
-                           int64_t &ImmOffset) {
+static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) {
   bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
 
   unsigned CD8_Scale =
@@ -427,7 +425,7 @@ static bool isDispOrCDisp8(uint64_t TSFlags, int64_t Value,
   if (Value & (CD8_Scale - 1)) // Unaligned offset
     return false;
 
-  int64_t CDisp8 = Value / static_cast<int64_t>(CD8_Scale);
+  int CDisp8 = Value / static_cast<int>(CD8_Scale);
   if (!isInt<8>(CDisp8))
     return false;
 
@@ -520,7 +518,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
                                      uint64_t StartByte,
                                      SmallVectorImpl<char> &CB,
                                      SmallVectorImpl<MCFixup> &Fixups,
-                                     int64_t ImmOffset) const {
+                                     int ImmOffset) const {
   const MCExpr *Expr = nullptr;
   if (DispOp.isImm()) {
     // If this is a simple integer displacement that doesn't require a
@@ -801,7 +799,7 @@ void X86MCCodeEmitter::emitMemModRMByte(
     // This also handles the 0 displacement for [EBP], [R13], [R21] or [R29]. We
     // can't use disp8 if the {disp32} pseudo prefix is present.
     if (Disp.isImm() && AllowDisp8) {
-      int64_t ImmOffset = 0;
+      int ImmOffset = 0;
       if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
         emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CB);
         emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, CB, Fixups,
@@ -828,7 +826,7 @@ void X86MCCodeEmitter::emitMemModRMByte(
 
   bool ForceDisp32 = false;
   bool ForceDisp8 = false;
-  int64_t ImmOffset = 0;
+  int ImmOffset = 0;
   if (BaseReg == 0) {
     // If there is no base register, we emit the special case SIB byte with
     // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 3e44ed621fdff4..d914e1b61ab075 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -380,9 +380,9 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
   return MI;
 }
 
-int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
-                                         MachineBasicBlock::iterator &MBBI,
-                                         bool doMergeWithPrevious) const {
+int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator &MBBI,
+                                     bool doMergeWithPrevious) const {
   if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
       (!doMergeWithPrevious && MBBI == MBB.end()))
     return 0;
@@ -405,7 +405,7 @@ int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
     PI = std::prev(PI);
 
   unsigned Opc = PI->getOpcode();
-  int64_t Offset = 0;
+  int Offset = 0;
 
   if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
       PI->getOperand(0).getReg() == StackPtr) {
@@ -473,7 +473,7 @@ void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA(
                                : FramePtr;
   unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
   // Offset = space for return address + size of the frame pointer itself.
-  int64_t Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
+  unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
   BuildCFI(MBB, MBBI, DebugLoc{},
            MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));
   emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
@@ -1881,7 +1881,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
 
   // For EH funclets, only allocate enough space for outgoing calls. Save the
   // NumBytes value that we would've used for the parent frame.
-  uint64_t ParentFrameNumBytes = NumBytes;
+  unsigned ParentFrameNumBytes = NumBytes;
   if (IsFunclet)
     NumBytes = getWinEHFuncletFrameSize(MF);
 
@@ -2430,7 +2430,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   if (HasFP) {
     if (X86FI->hasSwiftAsyncContext()) {
       // Discard the context.
-      int64_t Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
+      int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
       emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
     }
     // Pop EBP.
@@ -2562,7 +2562,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (!HasFP && NeedsDwarfCFI) {
     MBBI = FirstCSPop;
-    int64_t Offset = -(int64_t)CSSize - SlotSize;
+    int64_t Offset = -CSSize - SlotSize;
     // Mark callee-saved pop instruction.
     // Define the current CFA rule to use the provided offset.
     while (MBBI != MBB.end()) {
@@ -2591,7 +2591,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
     // Add the return addr area delta back since we are not tail calling.
-    int64_t Offset = -1 * X86FI->getTCReturnAddrDelta();
+    int Offset = -1 * X86FI->getTCReturnAddrDelta();
     assert(Offset >= 0 && "TCDelta should never be positive");
     if (Offset) {
       // Check for possible merge with preceding ADD instruction.
@@ -2625,7 +2625,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
   // object.
   // We need to factor in additional offsets applied during the prologue to the
   // frame, base, and stack pointer depending on which is used.
-  int64_t Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+  int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
   const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   unsigned CSSize = X86FI->getCalleeSavedFrameSize();
   uint64_t StackSize = MFI.getStackSize();
@@ -3919,7 +3919,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
   // FIXME: Don't set FrameSetup flag in catchret case.
 
   int FI = FuncInfo.EHRegNodeFrameIndex;
-  int64_t EHRegSize = MFI.getObjectSize(FI);
+  int EHRegSize = MFI.getObjectSize(FI);
 
   if (RestoreSP) {
     // MOV32rm -EHRegSize(%ebp), %esp
@@ -3929,8 +3929,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
   }
 
   Register UsedReg;
-  int64_t EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
-  int64_t EndOffset = -EHRegOffset - EHRegSize;
+  int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
+  int EndOffset = -EHRegOffset - EHRegSize;
   FuncInfo.EHRegNodeEndOffset = EndOffset;
 
   if (UsedReg == FramePtr) {
@@ -3951,7 +3951,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
         .setMIFlag(MachineInstr::FrameSetup);
     // MOV32rm SavedEBPOffset(%esi), %ebp
     assert(X86FI->getHasSEHFramePtrSave());
-    int64_t Offset =
+    int Offset =
         getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
             .getFixed();
     assert(UsedReg == BasePtr);
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index 49580b31d39c7b..2dc9ecc6109d78 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -137,9 +137,8 @@ class X86FrameLowering : public TargetFrameLowering {
   /// it is an ADD/SUB/LEA instruction it is deleted argument and the
   /// stack adjustment is returned as a positive value for ADD/LEA and
   /// a negative for SUB.
-  int64_t mergeSPUpdates(MachineBasicBlock &MBB,
-                         MachineBasicBlock::iterator &MBBI,
-                         bool doMergeWithPrevious) const;
+  int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                     bool doMergeWithPrevious) const;
 
   /// Emit a series of instructions to increment / decrement the stack
   /// pointer by a constant value.
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 57f645462089ed..be0cf1596d0d90 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -893,7 +893,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
 
   // Determine base register and offset.
-  int64_t FIOffset;
+  int FIOffset;
   Register BasePtr;
   if (MI.isReturn()) {
     assert((!hasStackRealignment(MF) ||
@@ -946,11 +946,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   if (MI.getOperand(FIOperandNum+3).isImm()) {
     // Offset is a 32-bit integer.
     int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
-    int64_t Offset = FIOffset + Imm;
-    if (!Is64Bit) {
-      assert(isInt<32>((long long)FIOffset + Imm) &&
-             "Requesting 64-bit offset in 32-bit immediate!");
-    }
+    int Offset = FIOffset + Imm;
+    assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
+           "Requesting 64-bit offset in 32-bit immediate!");
     if (Offset != 0 || !tryOptimizeLEAtoMOV(II))
       MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
   } else {
diff --git a/llvm/test/CodeGen/PowerPC/huge-frame-size.ll b/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
index 78bdac021ac8af..f1039df6f549ae 100644
--- a/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
+++ b/llvm/test/CodeGen/PowerPC/huge-frame-size.ll
@@ -18,7 +18,7 @@ define void @foo(i8 %x) {
 ; CHECK-LE-NEXT:    oris 0, 0, 65535
 ; CHECK-LE-NEXT:    ori 0, 0, 65504
 ; CHECK-LE-NEXT:    stdux 1, 1, 0
-; CHECK-LE-NEXT:    .cfi_def_cfa_offset 4294967328
+; CHECK-LE-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-LE-NEXT:    li 4, 1
 ; CHECK-LE-NEXT:    addi 5, 1, 32
 ; CHECK-LE-NEXT:    stb 3, 32(1)
diff --git a/llvm/test/CodeGen/X86/huge-stack.ll b/llvm/test/CodeGen/X86/huge-stack.ll
deleted file mode 100644
index 4596c50382a08e..00000000000000
--- a/llvm/test/CodeGen/X86/huge-stack.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 4
-; RUN: llc -O0 -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK
-%large = type [4294967295 x i8]
-
-define void @foo() unnamed_addr #0 {
-; CHECK-LABEL: foo:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movabsq $8589934462, %rax # imm = 0x1FFFFFF7E
-; CHECK-NEXT:    subq %rax, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 8589934470
-; CHECK-NEXT:    movb $42, 4294967167(%rsp)
-; CHECK-NEXT:    movb $43, -128(%rsp)
-; CHECK-NEXT:    movabsq $8589934462, %rax # imm = 0x1FFFFFF7E
-; CHECK-NEXT:    addq %rax, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
-  %1 = alloca %large, align 1
-  %2 = alloca %large, align 1
-  %3 = getelementptr inbounds %large, ptr %1, i64 0, i64 0
-  store i8 42, ptr %3, align 1
-  %4 = getelementptr inbounds %large, ptr %2, i64 0, i64 0
-  store i8 43, ptr %4, align 1
-  ret void
-}

>From 77118536b52bf5256eed85f61451d0beb6cf5dc3 Mon Sep 17 00:00:00 2001
From: Marc Auberer <marc.auberer at chillibits.com>
Date: Wed, 27 Mar 2024 17:22:41 +0100
Subject: [PATCH 40/54] [libc] Remove obsolete LIBC_HAS_BUILTIN macro (#86554)

Fixes #86546 by removing the `LIBC_HAS_BUILTIN` macro. The macro was
only needed to support older compilers that did not provide
`__has_builtin`; all of the compilers we support already have this
builtin.
See: https://libc.llvm.org/compiler_support.html
All uses now call `__has_builtin` directly.

cc @nickdesaulniers
---
 libc/docs/dev/code_style.rst                  |  2 +-
 libc/src/__support/CPP/CMakeLists.txt         |  2 -
 libc/src/__support/CPP/atomic.h               | 74 ++++++++++---------
 libc/src/__support/CPP/bit.h                  | 15 ++--
 .../__support/CPP/type_traits/add_pointer.h   |  1 -
 libc/src/__support/CPP/type_traits/decay.h    |  1 -
 .../CPP/type_traits/is_destructible.h         |  3 +-
 .../__support/CPP/type_traits/is_function.h   |  3 +-
 .../CPP/type_traits/is_lvalue_reference.h     |  3 +-
 .../__support/CPP/type_traits/is_reference.h  |  3 +-
 .../CPP/type_traits/is_rvalue_reference.h     |  3 +-
 .../CPP/type_traits/is_trivially_copyable.h   |  1 -
 .../type_traits/is_trivially_destructible.h   |  5 +-
 .../CPP/type_traits/remove_all_extents.h      |  3 +-
 libc/src/__support/FPUtil/CMakeLists.txt      |  1 -
 libc/src/__support/FPUtil/FEnvImpl.h          |  1 -
 libc/src/__support/FPUtil/gpu/FMA.h           |  8 +-
 libc/src/__support/macros/config.h            | 18 -----
 libc/src/__support/macros/optimization.h      |  1 -
 libc/src/__support/macros/sanitizer.h         |  3 +-
 libc/src/__support/math_extras.h              |  9 +--
 libc/src/__support/memory_size.h              |  2 +-
 .../src/string/memory_utils/generic/builtin.h |  8 +-
 libc/src/string/memory_utils/utils.h          |  5 +-
 libc/utils/gpu/server/rpc_server.cpp          |  5 ++
 .../llvm-project-overlay/libc/BUILD.bazel     |  4 -
 26 files changed, 77 insertions(+), 107 deletions(-)

diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst
index e6fc6df5a0f6b3..22a18b7a4cc1dd 100644
--- a/libc/docs/dev/code_style.rst
+++ b/libc/docs/dev/code_style.rst
@@ -55,7 +55,7 @@ We define two kinds of macros:
    * ``src/__support/macros/config.h`` - Important compiler and platform
      features. Such macros can be used to produce portable code by
      parameterizing compilation based on the presence or lack of a given
-     feature. e.g., ``LIBC_HAS_BUILTIN``
+     feature. e.g., ``LIBC_HAS_FEATURE``
    * ``src/__support/macros/attributes.h`` - Attributes for functions, types,
      and variables. e.g., ``LIBC_UNUSED``
    * ``src/__support/macros/optimization.h`` - Portable macros for performance
diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt
index f76285be521945..84d01fe0451602 100644
--- a/libc/src/__support/CPP/CMakeLists.txt
+++ b/libc/src/__support/CPP/CMakeLists.txt
@@ -18,7 +18,6 @@ add_header_library(
     .limits
     .type_traits
     libc.src.__support.macros.attributes
-    libc.src.__support.macros.config
     libc.src.__support.macros.sanitizer
 )
 
@@ -157,7 +156,6 @@ add_header_library(
   DEPENDS
     libc.include.llvm-libc-macros.stdfix_macros
     libc.src.__support.macros.attributes
-    libc.src.__support.macros.config
     libc.src.__support.macros.properties.types
 )
 
diff --git a/libc/src/__support/CPP/atomic.h b/libc/src/__support/CPP/atomic.h
index b74cb5981dbaf8..5e428940565b99 100644
--- a/libc/src/__support/CPP/atomic.h
+++ b/libc/src/__support/CPP/atomic.h
@@ -71,10 +71,11 @@ template <typename T> struct Atomic {
 
   T load(MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
          [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
-    if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_load_n))
-      return __scoped_atomic_load_n(&val, int(mem_ord), (int)(mem_scope));
-    else
-      return __atomic_load_n(&val, int(mem_ord));
+#if __has_builtin(__scoped_atomic_load_n)
+    return __scoped_atomic_load_n(&val, int(mem_ord), (int)(mem_scope));
+#else
+    return __atomic_load_n(&val, int(mem_ord));
+#endif
   }
 
   // Atomic store.
@@ -85,10 +86,11 @@ template <typename T> struct Atomic {
 
   void store(T rhs, MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
              [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
-    if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_store_n))
-      __scoped_atomic_store_n(&val, rhs, int(mem_ord), (int)(mem_scope));
-    else
-      __atomic_store_n(&val, rhs, int(mem_ord));
+#if __has_builtin(__scoped_atomic_store_n)
+    __scoped_atomic_store_n(&val, rhs, int(mem_ord), (int)(mem_scope));
+#else
+    __atomic_store_n(&val, rhs, int(mem_ord));
+#endif
   }
 
   // Atomic compare exchange
@@ -101,47 +103,51 @@ template <typename T> struct Atomic {
 
   T exchange(T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
              [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
-    if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_exchange_n))
-      return __scoped_atomic_exchange_n(&val, desired, int(mem_ord),
-                                        (int)(mem_scope));
-    else
-      return __atomic_exchange_n(&val, desired, int(mem_ord));
+#if __has_builtin(__scoped_atomic_exchange_n)
+    return __scoped_atomic_exchange_n(&val, desired, int(mem_ord),
+                                      (int)(mem_scope));
+#else
+    return __atomic_exchange_n(&val, desired, int(mem_ord));
+#endif
   }
 
   T fetch_add(T increment, MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
               [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
-    if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_add))
-      return __scoped_atomic_fetch_add(&val, increment, int(mem_ord),
-                                       (int)(mem_scope));
-    else
-      return __atomic_fetch_add(&val, increment, int(mem_ord));
+#if __has_builtin(__scoped_atomic_fetch_add)
+    return __scoped_atomic_fetch_add(&val, increment, int(mem_ord),
+                                     (int)(mem_scope));
+#else
+    return __atomic_fetch_add(&val, increment, int(mem_ord));
+#endif
   }
 
   T fetch_or(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
              [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
-    if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_or))
-      return __scoped_atomic_fetch_or(&val, mask, int(mem_ord),
-                                      (int)(mem_scope));
-    else
-      return __atomic_fetch_or(&val, mask, int(mem_ord));
+#if __has_builtin(__scoped_atomic_fetch_or)
+    return __scoped_atomic_fetch_or(&val, mask, int(mem_ord), (int)(mem_scope));
+#else
+    return __atomic_fetch_or(&val, mask, int(mem_ord));
+#endif
   }
 
   T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
               [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
-    if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_and))
-      return __scoped_atomic_fetch_and(&val, mask, int(mem_ord),
-                                       (int)(mem_scope));
-    else
-      return __atomic_fetch_and(&val, mask, int(mem_ord));
+#if __has_builtin(__scoped_atomic_fetch_and)
+    return __scoped_atomic_fetch_and(&val, mask, int(mem_ord),
+                                     (int)(mem_scope));
+#else
+    return __atomic_fetch_and(&val, mask, int(mem_ord));
+#endif
   }
 
   T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
               [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
-    if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_sub))
-      return __scoped_atomic_fetch_sub(&val, decrement, int(mem_ord),
-                                       (int)(mem_scope));
-    else
-      return __atomic_fetch_sub(&val, decrement, int(mem_ord));
+#if __has_builtin(__scoped_atomic_fetch_sub)
+    return __scoped_atomic_fetch_sub(&val, decrement, int(mem_ord),
+                                     (int)(mem_scope));
+#else
+    return __atomic_fetch_sub(&val, decrement, int(mem_ord));
+#endif
   }
 
   // Set the value without using an atomic operation. This is useful
@@ -166,7 +172,7 @@ LIBC_INLINE void atomic_thread_fence([[maybe_unused]] MemoryOrder mem_ord) {
 // except no instructions for memory ordering are issued. Only reordering of
 // the instructions by the compiler is suppressed as order instructs.
 LIBC_INLINE void atomic_signal_fence([[maybe_unused]] MemoryOrder mem_ord) {
-#if LIBC_HAS_BUILTIN(__atomic_signal_fence)
+#if __has_builtin(__atomic_signal_fence)
   __atomic_signal_fence(static_cast<int>(mem_ord));
 #else
   // if the builtin is not ready, use asm as a full compiler barrier.
diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h
index 526c499adc374c..80f50fd221efa7 100644
--- a/libc/src/__support/CPP/bit.h
+++ b/libc/src/__support/CPP/bit.h
@@ -14,14 +14,13 @@
 #include "src/__support/CPP/limits.h" // numeric_limits
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN
 #include "src/__support/macros/sanitizer.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE::cpp {
 
-#if LIBC_HAS_BUILTIN(__builtin_memcpy_inline)
+#if __has_builtin(__builtin_memcpy_inline)
 #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
 #endif
 
@@ -36,20 +35,20 @@ LIBC_INLINE constexpr cpp::enable_if_t<
     To>
 bit_cast(const From &from) {
   MSAN_UNPOISON(&from, sizeof(From));
-#if LIBC_HAS_BUILTIN(__builtin_bit_cast)
+#if __has_builtin(__builtin_bit_cast)
   return __builtin_bit_cast(To, from);
 #else
   To to;
   char *dst = reinterpret_cast<char *>(&to);
   const char *src = reinterpret_cast<const char *>(&from);
-#if LIBC_HAS_BUILTIN(__builtin_memcpy_inline)
+#if __has_builtin(__builtin_memcpy_inline)
   __builtin_memcpy_inline(dst, src, sizeof(To));
 #else
   for (unsigned i = 0; i < sizeof(To); ++i)
     dst[i] = src[i];
-#endif // LIBC_HAS_BUILTIN(__builtin_memcpy_inline)
+#endif // __has_builtin(__builtin_memcpy_inline)
   return to;
-#endif // LIBC_HAS_BUILTIN(__builtin_bit_cast)
+#endif // __has_builtin(__builtin_bit_cast)
 }
 
 template <typename T>
@@ -94,7 +93,7 @@ countr_zero(T value) {
   }
   return zero_bits;
 }
-#if LIBC_HAS_BUILTIN(__builtin_ctzs)
+#if __has_builtin(__builtin_ctzs)
 ADD_SPECIALIZATION(countr_zero, unsigned short, __builtin_ctzs)
 #endif
 ADD_SPECIALIZATION(countr_zero, unsigned int, __builtin_ctz)
@@ -124,7 +123,7 @@ countl_zero(T value) {
   }
   return zero_bits;
 }
-#if LIBC_HAS_BUILTIN(__builtin_clzs)
+#if __has_builtin(__builtin_clzs)
 ADD_SPECIALIZATION(countl_zero, unsigned short, __builtin_clzs)
 #endif
 ADD_SPECIALIZATION(countl_zero, unsigned int, __builtin_clz)
diff --git a/libc/src/__support/CPP/type_traits/add_pointer.h b/libc/src/__support/CPP/type_traits/add_pointer.h
index 72a764bb8ba60c..1257033ee80e2c 100644
--- a/libc/src/__support/CPP/type_traits/add_pointer.h
+++ b/libc/src/__support/CPP/type_traits/add_pointer.h
@@ -10,7 +10,6 @@
 
 #include "src/__support/CPP/type_traits/remove_reference.h"
 #include "src/__support/CPP/type_traits/type_identity.h"
-#include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
diff --git a/libc/src/__support/CPP/type_traits/decay.h b/libc/src/__support/CPP/type_traits/decay.h
index a018286fddd8ab..f1a1200ab2ba1d 100644
--- a/libc/src/__support/CPP/type_traits/decay.h
+++ b/libc/src/__support/CPP/type_traits/decay.h
@@ -9,7 +9,6 @@
 #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_DECAY_H
 
 #include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
 
 #include "src/__support/CPP/type_traits/add_pointer.h"
 #include "src/__support/CPP/type_traits/conditional.h"
diff --git a/libc/src/__support/CPP/type_traits/is_destructible.h b/libc/src/__support/CPP/type_traits/is_destructible.h
index d47de1cc797b29..f94fe309ac8f74 100644
--- a/libc/src/__support/CPP/type_traits/is_destructible.h
+++ b/libc/src/__support/CPP/type_traits/is_destructible.h
@@ -16,12 +16,11 @@
 #include "src/__support/CPP/type_traits/true_type.h"
 #include "src/__support/CPP/type_traits/type_identity.h"
 #include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
 // is_destructible
-#if LIBC_HAS_BUILTIN(__is_destructible)
+#if __has_builtin(__is_destructible)
 template <typename T>
 struct is_destructible : bool_constant<__is_destructible(T)> {};
 #else
diff --git a/libc/src/__support/CPP/type_traits/is_function.h b/libc/src/__support/CPP/type_traits/is_function.h
index 557b3224484bca..0eba5860ad607a 100644
--- a/libc/src/__support/CPP/type_traits/is_function.h
+++ b/libc/src/__support/CPP/type_traits/is_function.h
@@ -12,12 +12,11 @@
 #include "src/__support/CPP/type_traits/is_const.h"
 #include "src/__support/CPP/type_traits/is_reference.h"
 #include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
 // is_function
-#if LIBC_HAS_BUILTIN(__is_function)
+#if __has_builtin(__is_function)
 template <typename T>
 struct is_function : integral_constant<bool, __is_function(T)> {};
 #else
diff --git a/libc/src/__support/CPP/type_traits/is_lvalue_reference.h b/libc/src/__support/CPP/type_traits/is_lvalue_reference.h
index f52e303afad2a5..1dff57f186a3a3 100644
--- a/libc/src/__support/CPP/type_traits/is_lvalue_reference.h
+++ b/libc/src/__support/CPP/type_traits/is_lvalue_reference.h
@@ -12,12 +12,11 @@
 #include "src/__support/CPP/type_traits/false_type.h"
 #include "src/__support/CPP/type_traits/true_type.h"
 #include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
 // is_lvalue_reference
-#if LIBC_HAS_BUILTIN(__is_lvalue_reference)
+#if __has_builtin(__is_lvalue_reference)
 template <typename T>
 struct is_lvalue_reference : bool_constant<__is_lvalue_reference(T)> {};
 #else
diff --git a/libc/src/__support/CPP/type_traits/is_reference.h b/libc/src/__support/CPP/type_traits/is_reference.h
index c017028edf411f..bbfb2b7359c3e1 100644
--- a/libc/src/__support/CPP/type_traits/is_reference.h
+++ b/libc/src/__support/CPP/type_traits/is_reference.h
@@ -12,12 +12,11 @@
 #include "src/__support/CPP/type_traits/false_type.h"
 #include "src/__support/CPP/type_traits/true_type.h"
 #include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
 // is_reference
-#if LIBC_HAS_BUILTIN(__is_reference)
+#if __has_builtin(__is_reference)
 template <typename T> struct is_reference : bool_constant<__is_reference(T)> {};
 #else
 template <typename T> struct is_reference : public false_type {};
diff --git a/libc/src/__support/CPP/type_traits/is_rvalue_reference.h b/libc/src/__support/CPP/type_traits/is_rvalue_reference.h
index f0487e41c998fe..3efbbe6b033a0f 100644
--- a/libc/src/__support/CPP/type_traits/is_rvalue_reference.h
+++ b/libc/src/__support/CPP/type_traits/is_rvalue_reference.h
@@ -12,12 +12,11 @@
 #include "src/__support/CPP/type_traits/false_type.h"
 #include "src/__support/CPP/type_traits/true_type.h"
 #include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
 // is_rvalue_reference
-#if LIBC_HAS_BUILTIN(__is_rvalue_reference)
+#if __has_builtin(__is_rvalue_reference)
 template <typename T>
 struct is_rvalue_reference : bool_constant<__is_rvalue_reference(T)> {};
 #else
diff --git a/libc/src/__support/CPP/type_traits/is_trivially_copyable.h b/libc/src/__support/CPP/type_traits/is_trivially_copyable.h
index 0c3fdcc711d57f..b4c825d579619c 100644
--- a/libc/src/__support/CPP/type_traits/is_trivially_copyable.h
+++ b/libc/src/__support/CPP/type_traits/is_trivially_copyable.h
@@ -9,7 +9,6 @@
 #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H
 
 #include "src/__support/CPP/type_traits/integral_constant.h"
-#include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
diff --git a/libc/src/__support/CPP/type_traits/is_trivially_destructible.h b/libc/src/__support/CPP/type_traits/is_trivially_destructible.h
index 3345149433afc4..37e0e869266e1d 100644
--- a/libc/src/__support/CPP/type_traits/is_trivially_destructible.h
+++ b/libc/src/__support/CPP/type_traits/is_trivially_destructible.h
@@ -11,12 +11,11 @@
 #include "src/__support/CPP/type_traits/bool_constant.h"
 #include "src/__support/CPP/type_traits/is_destructible.h"
 #include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
 // is_trivially_destructible
-#if LIBC_HAS_BUILTIN(__is_trivially_destructible)
+#if __has_builtin(__is_trivially_destructible)
 template <typename T>
 struct is_trivially_destructible
     : public bool_constant<__is_trivially_destructible(T)> {};
@@ -25,7 +24,7 @@ template <typename T>
 struct is_trivially_destructible
     : public bool_constant<cpp::is_destructible_v<T> &&__has_trivial_destructor(
           T)> {};
-#endif // LIBC_HAS_BUILTIN(__is_trivially_destructible)
+#endif // __has_builtin(__is_trivially_destructible)
 template <typename T>
 LIBC_INLINE_VAR constexpr bool is_trivially_destructible_v =
     is_trivially_destructible<T>::value;
diff --git a/libc/src/__support/CPP/type_traits/remove_all_extents.h b/libc/src/__support/CPP/type_traits/remove_all_extents.h
index bff6341d3e4560..5941b82bbc1619 100644
--- a/libc/src/__support/CPP/type_traits/remove_all_extents.h
+++ b/libc/src/__support/CPP/type_traits/remove_all_extents.h
@@ -9,14 +9,13 @@
 #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_REMOVE_ALL_EXTENTS_H
 
 #include "src/__support/CPP/type_traits/type_identity.h"
-#include "src/__support/macros/config.h"
 
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE::cpp {
 
 // remove_all_extents
-#if LIBC_HAS_BUILTIN(__remove_all_extents)
+#if __has_builtin(__remove_all_extents)
 template <typename T> using remove_all_extents_t = __remove_all_extents(T);
 template <typename T>
 struct remove_all_extents : cpp::type_identity<remove_all_extents_t<T>> {};
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index 0f435023419757..ff155a19758d20 100644
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -6,7 +6,6 @@ add_header_library(
     libc.include.fenv
     libc.include.math
     libc.src.__support.macros.attributes
-    libc.src.__support.macros.config
     libc.src.errno.errno
 )
 
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index a6a533dcfdf4aa..6086d5d3de2dca 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -11,7 +11,6 @@
 
 #include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
-#include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
 #include "src/__support/macros/properties/architectures.h"
 #include "src/errno/libc_errno.h"
 #include <fenv.h>
diff --git a/libc/src/__support/FPUtil/gpu/FMA.h b/libc/src/__support/FPUtil/gpu/FMA.h
index 86bc8603149611..ef1cd26a72dd7c 100644
--- a/libc/src/__support/FPUtil/gpu/FMA.h
+++ b/libc/src/__support/FPUtil/gpu/FMA.h
@@ -10,12 +10,12 @@
 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GPU_FMA_H
 
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/macros/config.h"
 
-// These intrinsics map to the FMA instrunctions in the target ISA for the GPU.
+// These intrinsics map to the FMA instructions in the target ISA for the GPU.
 // The default rounding mode generated from these will be to the nearest even.
-static_assert(LIBC_HAS_BUILTIN(__builtin_fma), "FMA builtins must be defined");
-static_assert(LIBC_HAS_BUILTIN(__builtin_fmaf), "FMA builtins must be defined");
+#if !__has_builtin(__builtin_fma) || !__has_builtin(__builtin_fmaf)
+#error "FMA builtins must be defined"
+#endif
 
 namespace LIBC_NAMESPACE {
 namespace fputil {
diff --git a/libc/src/__support/macros/config.h b/libc/src/__support/macros/config.h
index 6666c136669610..3f200f0d62ba26 100644
--- a/libc/src/__support/macros/config.h
+++ b/libc/src/__support/macros/config.h
@@ -13,24 +13,6 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_CONFIG_H
 #define LLVM_LIBC_SRC___SUPPORT_MACROS_CONFIG_H
 
-// LIBC_HAS_BUILTIN()
-//
-// Checks whether the compiler supports a Clang Feature Checking Macro, and if
-// so, checks whether it supports the provided builtin function "x" where x
-// is one of the functions noted in
-// https://clang.llvm.org/docs/LanguageExtensions.html
-//
-// Note: Use this macro to avoid an extra level of #ifdef __has_builtin check.
-// http://releases.llvm.org/3.3/tools/clang/docs/LanguageExtensions.html
-
-// Compiler builtin-detection.
-// clang.llvm.org/docs/LanguageExtensions.html#has-builtin
-#ifdef __has_builtin
-#define LIBC_HAS_BUILTIN(x) __has_builtin(x)
-#else
-#define LIBC_HAS_BUILTIN(x) 0
-#endif
-
 // Compiler feature-detection.
 // clang.llvm.org/docs/LanguageExtensions.html#has-feature-and-has-extension
 #ifdef __has_feature
diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h
index ae97efcaa41706..59886ca44be12a 100644
--- a/libc/src/__support/macros/optimization.h
+++ b/libc/src/__support/macros/optimization.h
@@ -11,7 +11,6 @@
 #define LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
 
 #include "src/__support/macros/attributes.h"          // LIBC_INLINE
-#include "src/__support/macros/config.h"              // LIBC_HAS_BUILTIN
 #include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG
 
 // We use a template to implement likely/unlikely to make sure that we don't
diff --git a/libc/src/__support/macros/sanitizer.h b/libc/src/__support/macros/sanitizer.h
index fc66c2005c42de..bd9b62b7121a14 100644
--- a/libc/src/__support/macros/sanitizer.h
+++ b/libc/src/__support/macros/sanitizer.h
@@ -47,8 +47,7 @@
 // Functions to unpoison memory
 //-----------------------------------------------------------------------------
 
-#if defined(LIBC_HAVE_MEMORY_SANITIZER) &&                                     \
-    LIBC_HAS_BUILTIN(__builtin_constant_p)
+#if defined(LIBC_HAVE_MEMORY_SANITIZER) && __has_builtin(__builtin_constant_p)
 // Only perform MSAN unpoison in non-constexpr context.
 #include <sanitizer/msan_interface.h>
 #define MSAN_UNPOISON(addr, size)                                              \
diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h
index 28ee1be8b99997..70a8800b285d02 100644
--- a/libc/src/__support/math_extras.h
+++ b/libc/src/__support/math_extras.h
@@ -14,7 +14,6 @@
 #include "src/__support/CPP/limits.h"        // CHAR_BIT, numeric_limits
 #include "src/__support/CPP/type_traits.h"   // is_unsigned_v
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
-#include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
 
 namespace LIBC_NAMESPACE {
 
@@ -61,7 +60,7 @@ add_with_carry(T a, T b, T carry_in) {
   return add_with_carry_const<T>(a, b, carry_in);
 }
 
-#if LIBC_HAS_BUILTIN(__builtin_addc)
+#if __has_builtin(__builtin_addc)
 // https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins
 
 template <>
@@ -129,7 +128,7 @@ add_with_carry<unsigned long long>(unsigned long long a, unsigned long long b,
   }
 }
 
-#endif // LIBC_HAS_BUILTIN(__builtin_addc)
+#endif // __has_builtin(__builtin_addc)
 
 // Subtract with borrow
 template <typename T> struct DiffBorrow {
@@ -157,7 +156,7 @@ sub_with_borrow(T a, T b, T borrow_in) {
   return sub_with_borrow_const<T>(a, b, borrow_in);
 }
 
-#if LIBC_HAS_BUILTIN(__builtin_subc)
+#if __has_builtin(__builtin_subc)
 // https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins
 
 template <>
@@ -225,7 +224,7 @@ sub_with_borrow<unsigned long long>(unsigned long long a, unsigned long long b,
   }
 }
 
-#endif // LIBC_HAS_BUILTIN(__builtin_subc)
+#endif // __has_builtin(__builtin_subc)
 
 template <typename T>
 [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, int>
diff --git a/libc/src/__support/memory_size.h b/libc/src/__support/memory_size.h
index 7bd16a1695be9a..491123bbabf308 100644
--- a/libc/src/__support/memory_size.h
+++ b/libc/src/__support/memory_size.h
@@ -19,7 +19,7 @@
 namespace LIBC_NAMESPACE {
 namespace internal {
 template <class T> LIBC_INLINE bool mul_overflow(T a, T b, T *res) {
-#if LIBC_HAS_BUILTIN(__builtin_mul_overflow)
+#if __has_builtin(__builtin_mul_overflow)
   return __builtin_mul_overflow(a, b, res);
 #else
   T max = cpp::numeric_limits<T>::max();
diff --git a/libc/src/string/memory_utils/generic/builtin.h b/libc/src/string/memory_utils/generic/builtin.h
index 5239329f653b34..ba4f4b89840889 100644
--- a/libc/src/string/memory_utils/generic/builtin.h
+++ b/libc/src/string/memory_utils/generic/builtin.h
@@ -10,16 +10,16 @@
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_BUILTIN_H
 
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
-#include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
 #include "src/string/memory_utils/utils.h"   // Ptr, CPtr
 
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE {
 
-static_assert(LIBC_HAS_BUILTIN(__builtin_memcpy), "Builtin not defined");
-static_assert(LIBC_HAS_BUILTIN(__builtin_memset), "Builtin not defined");
-static_assert(LIBC_HAS_BUILTIN(__builtin_memmove), "Builtin not defined");
+#if !__has_builtin(__builtin_memcpy) || !__has_builtin(__builtin_memset) ||    \
+    !__has_builtin(__builtin_memmove)
+#error "Builtin not defined");
+#endif
 
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_builtin(Ptr dst, CPtr src, size_t count, size_t offset = 0) {
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 79526d19c6b3dc..b3e1a26ad99610 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -14,7 +14,6 @@
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/endian.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
-#include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
 #include "src/__support/macros/properties/architectures.h"
 
 #include <stddef.h> // size_t
@@ -71,11 +70,11 @@ LIBC_INLINE bool is_disjoint(const void *p1, const void *p2, size_t size) {
   return sdiff >= 0 ? size <= udiff : size <= neg_udiff;
 }
 
-#if LIBC_HAS_BUILTIN(__builtin_memcpy_inline)
+#if __has_builtin(__builtin_memcpy_inline)
 #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
 #endif
 
-#if LIBC_HAS_BUILTIN(__builtin_memset_inline)
+#if __has_builtin(__builtin_memset_inline)
 #define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
 #endif
 
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 90af1569c4c53b..46ad98fa02cc51 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -6,6 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+// Workaround for missing __has_builtin in < GCC 10.
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
 #include "llvmlibc_rpc_server.h"
 
 #include "src/__support/RPC/rpc.h"
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index e0790e0ab59efa..eb0afbb6dd6ffe 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -216,7 +216,6 @@ libc_support_library(
         ":__support_cpp_limits",
         ":__support_cpp_type_traits",
         ":__support_macros_attributes",
-        ":__support_macros_config",
         ":__support_macros_sanitizer",
     ],
 )
@@ -383,7 +382,6 @@ libc_support_library(
     ],
     deps = [
         ":__support_macros_attributes",
-        ":__support_macros_config",
         ":__support_macros_properties_types",
         ":llvm_libc_macros_stdfix_macros",
     ],
@@ -663,7 +661,6 @@ libc_support_library(
         ":__support_cpp_limits",
         ":__support_cpp_type_traits",
         ":__support_macros_attributes",
-        ":__support_macros_config",
     ],
 )
 
@@ -2318,7 +2315,6 @@ libc_support_library(
         ":__support_cpp_cstddef",
         ":__support_cpp_type_traits",
         ":__support_macros_attributes",
-        ":__support_macros_config",
         ":__support_macros_optimization",
         ":__support_macros_properties_architectures",
         ":__support_macros_properties_cpu_features",

>From 6a0ec8e25cba9d398cf525889c53835cf40247a3 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Wed, 27 Mar 2024 09:25:46 -0700
Subject: [PATCH 41/54] [lldb] Revive shell test after updating UnwindTable
 (#86770)

In
     commit 2f63718f8567413a1c596bda803663eb58d6da5a
     Author: Jason Molenda <jmolenda at apple.com>
     Date:   Tue Mar 26 09:07:15 2024 -0700

[lldb] Don't clear a Module's UnwindTable when adding a SymbolFile
(#86603)

I stopped clearing a Module's UnwindTable when we add a SymbolFile to
avoid the memory management problems with adding a symbol file
asynchronously while the UnwindTable is being accessed on another
thread. This broke the target-symbols-add-unwind.test shell test on
Linux which removes the DWARF debug_frame section from a binary, loads
it, then loads the unstripped binary with the DWARF debug_frame section
and checks that the UnwindPlans for a function include debug_frame.

I originally decided that I was willing to sacrifice the possibility of
additional unwind sources from a symbol file because we rely on assembly
emulation so heavily, they're rarely critical. But there are targets
where we don't have emulation and rely on things like DWARF
debug_frame a lot more, so this probably wasn't a good choice.

This patch adds a new UnwindTable::Update method which looks for any new
sources of unwind information and adds it to the UnwindTable, and calls
that after a new SymbolFile has been added to a Module.
---
 lldb/include/lldb/Symbol/UnwindTable.h        |  4 ++
 lldb/source/Core/Module.cpp                   |  2 +
 lldb/source/Symbol/UnwindTable.cpp            | 45 +++++++++++++++++++
 .../SymbolFile/target-symbols-add-unwind.test | 27 +++++++++++
 4 files changed, 78 insertions(+)
 create mode 100644 lldb/test/Shell/SymbolFile/target-symbols-add-unwind.test

diff --git a/lldb/include/lldb/Symbol/UnwindTable.h b/lldb/include/lldb/Symbol/UnwindTable.h
index f0ce7047de2d1e..26826e5d1b497c 100644
--- a/lldb/include/lldb/Symbol/UnwindTable.h
+++ b/lldb/include/lldb/Symbol/UnwindTable.h
@@ -57,6 +57,10 @@ class UnwindTable {
 
   ArchSpec GetArchitecture();
 
+  /// Called after a SymbolFile has been added to a Module to add any new
+  /// unwind sections that may now be available.
+  void Update();
+
 private:
   void Dump(Stream &s);
 
diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp
index a520523a96521a..9c105b3f0e57a1 100644
--- a/lldb/source/Core/Module.cpp
+++ b/lldb/source/Core/Module.cpp
@@ -1009,6 +1009,8 @@ SymbolFile *Module::GetSymbolFile(bool can_create, Stream *feedback_strm) {
         m_symfile_up.reset(
             SymbolVendor::FindPlugin(shared_from_this(), feedback_strm));
         m_did_load_symfile = true;
+        if (m_unwind_table)
+          m_unwind_table->Update();
       }
     }
   }
diff --git a/lldb/source/Symbol/UnwindTable.cpp b/lldb/source/Symbol/UnwindTable.cpp
index 3c1a5187b11054..11bedf3d6052e7 100644
--- a/lldb/source/Symbol/UnwindTable.cpp
+++ b/lldb/source/Symbol/UnwindTable.cpp
@@ -84,6 +84,51 @@ void UnwindTable::Initialize() {
   }
 }
 
+void UnwindTable::Update() {
+  if (!m_initialized)
+    return Initialize();
+
+  std::lock_guard<std::mutex> guard(m_mutex);
+
+  ObjectFile *object_file = m_module.GetObjectFile();
+  if (!object_file)
+    return;
+
+  if (!m_object_file_unwind_up)
+    m_object_file_unwind_up = object_file->CreateCallFrameInfo();
+
+  SectionList *sl = m_module.GetSectionList();
+  if (!sl)
+    return;
+
+  SectionSP sect = sl->FindSectionByType(eSectionTypeEHFrame, true);
+  if (!m_eh_frame_up && sect) {
+    m_eh_frame_up = std::make_unique<DWARFCallFrameInfo>(
+        *object_file, sect, DWARFCallFrameInfo::EH);
+  }
+
+  sect = sl->FindSectionByType(eSectionTypeDWARFDebugFrame, true);
+  if (!m_debug_frame_up && sect) {
+    m_debug_frame_up = std::make_unique<DWARFCallFrameInfo>(
+        *object_file, sect, DWARFCallFrameInfo::DWARF);
+  }
+
+  sect = sl->FindSectionByType(eSectionTypeCompactUnwind, true);
+  if (!m_compact_unwind_up && sect) {
+    m_compact_unwind_up =
+        std::make_unique<CompactUnwindInfo>(*object_file, sect);
+  }
+
+  sect = sl->FindSectionByType(eSectionTypeARMexidx, true);
+  if (!m_arm_unwind_up && sect) {
+    SectionSP sect_extab = sl->FindSectionByType(eSectionTypeARMextab, true);
+    if (sect_extab.get()) {
+      m_arm_unwind_up =
+          std::make_unique<ArmUnwindInfo>(*object_file, sect, sect_extab);
+    }
+  }
+}
+
 UnwindTable::~UnwindTable() = default;
 
 std::optional<AddressRange>
diff --git a/lldb/test/Shell/SymbolFile/target-symbols-add-unwind.test b/lldb/test/Shell/SymbolFile/target-symbols-add-unwind.test
new file mode 100644
index 00000000000000..5420213d405e86
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/target-symbols-add-unwind.test
@@ -0,0 +1,27 @@
+# TODO: When it's possible to run "image show-unwind" without a running
+# process, we can remove the unsupported line below, and hard-code an ELF
+# triple in the test.
+# UNSUPPORTED: system-windows, system-darwin
+
+# RUN: cd %T
+# RUN: %clang_host %S/Inputs/target-symbols-add-unwind.c -g \
+# RUN:   -fno-unwind-tables -fno-asynchronous-unwind-tables \
+# RUN:   -o target-symbols-add-unwind.debug
+# RUN: llvm-objcopy --strip-debug target-symbols-add-unwind.debug \
+# RUN:   target-symbols-add-unwind.stripped
+# RUN: %lldb target-symbols-add-unwind.stripped -s %s -o quit | FileCheck %s
+
+process launch --stop-at-entry
+image show-unwind -n main
+# CHECK-LABEL: image show-unwind -n main
+# CHECK-NOT: debug_frame UnwindPlan:
+
+target symbols add -s target-symbols-add-unwind.stripped target-symbols-add-unwind.debug
+# CHECK-LABEL: target symbols add
+# CHECK: symbol file {{.*}} has been added to {{.*}}
+
+image show-unwind -n main
+# CHECK-LABEL: image show-unwind -n main
+# CHECK: debug_frame UnwindPlan:
+# CHECK-NEXT: This UnwindPlan originally sourced from DWARF CFI
+# CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes.

>From 468c6bea2280491283e45239ad1c0ac6a59b3da8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 16:23:46 +0000
Subject: [PATCH 42/54] Fix "result of 32-bit shift implicitly converted to 64
 bits" MSVC warning. NFCI.

---
 llvm/lib/Object/GOFFObjectFile.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Object/GOFFObjectFile.cpp b/llvm/lib/Object/GOFFObjectFile.cpp
index 6b48d464dc3ec7..2845d9362544bb 100644
--- a/llvm/lib/Object/GOFFObjectFile.cpp
+++ b/llvm/lib/Object/GOFFObjectFile.cpp
@@ -514,7 +514,7 @@ uint64_t GOFFObjectFile::getSectionAlignment(DataRefImpl Sec) const {
   const uint8_t *EsdRecord = getSectionEdEsdRecord(Sec);
   GOFF::ESDAlignment Pow2Alignment;
   ESDRecord::getAlignment(EsdRecord, Pow2Alignment);
-  return 1 << static_cast<uint64_t>(Pow2Alignment);
+  return 1ULL << static_cast<uint64_t>(Pow2Alignment);
 }
 
 bool GOFFObjectFile::isSectionText(DataRefImpl Sec) const {

>From f92fa7e2cf38341211af262b21c568bef4d76b10 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 16:25:55 +0000
Subject: [PATCH 43/54] [X86] Add -verify-machineinstrs to huge stack tests

Help identify EXPENSIVE_CHECKS regressions identified in #84114
---
 llvm/test/CodeGen/X86/huge-stack-offset.ll  | 4 ++--
 llvm/test/CodeGen/X86/huge-stack-offset2.ll | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll
index 68dcfa748b0cb6..e825328ccd89a2 100644
--- a/llvm/test/CodeGen/X86/huge-stack-offset.ll
+++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux-unknown | FileCheck %s --check-prefix=CHECK-64
-; RUN: llc < %s -mtriple=i386-linux-unknown | FileCheck %s --check-prefix=CHECK-32
+; RUN: llc < %s -mtriple=x86_64-linux-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=i386-linux-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-32
 
 ; Test that a large stack offset uses a single add/sub instruction to
 ; adjust the stack pointer.
diff --git a/llvm/test/CodeGen/X86/huge-stack-offset2.ll b/llvm/test/CodeGen/X86/huge-stack-offset2.ll
index 3bf0260cc12ab9..053643eb3686c5 100644
--- a/llvm/test/CodeGen/X86/huge-stack-offset2.ll
+++ b/llvm/test/CodeGen/X86/huge-stack-offset2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
 
 ; Test how we handle pathologically large stack frames when RAX is live through
 ; the prologue and epilogue.

>From 9669aba13295de5ccdefc44e22e30c0295e6afd2 Mon Sep 17 00:00:00 2001
From: AtariDreams <gfunni234 at gmail.com>
Date: Wed, 27 Mar 2024 12:33:56 -0400
Subject: [PATCH 44/54] [Thumb1] LivePhysRegs to LiveRegUnits (#84474)

This removes the r7 exception because otherwise, LiveRegUnits will try
to use that register.
---
 llvm/lib/Target/ARM/Thumb1FrameLowering.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index 0f4ece64bff532..047c6731333c9b 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -612,11 +612,11 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const {
 
 static void findTemporariesForLR(const BitVector &GPRsNoLRSP,
                                  const BitVector &PopFriendly,
-                                 const LivePhysRegs &UsedRegs, unsigned &PopReg,
+                                 const LiveRegUnits &UsedRegs, unsigned &PopReg,
                                  unsigned &TmpReg, MachineRegisterInfo &MRI) {
   PopReg = TmpReg = 0;
   for (auto Reg : GPRsNoLRSP.set_bits()) {
-    if (UsedRegs.available(MRI, Reg)) {
+    if (UsedRegs.available(Reg)) {
       // Remember the first pop-friendly register and exit.
       if (PopFriendly.test(Reg)) {
         PopReg = Reg;
@@ -684,7 +684,7 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
   // Look for a temporary register to use.
   // First, compute the liveness information.
   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
-  LivePhysRegs UsedRegs(TRI);
+  LiveRegUnits UsedRegs(TRI);
   UsedRegs.addLiveOuts(MBB);
   // The semantic of pristines changed recently and now,
   // the callee-saved registers that are touched in the function
@@ -710,11 +710,6 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
   unsigned TemporaryReg = 0;
   BitVector PopFriendly =
       TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID));
-  // R7 may be used as a frame pointer, hence marked as not generally
-  // allocatable, however there's no reason to not use it as a temporary for
-  // restoring LR.
-  if (STI.getFramePointerReg() == ARM::R7)
-    PopFriendly.set(ARM::R7);
 
   assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
   // Rebuild the GPRs from the high registers because they are removed

>From 6ad1cf3b37f0eefa5f43f90990ec3dcf5c87dead Mon Sep 17 00:00:00 2001
From: Cyndy Ishida <cyndy_ishida at apple.com>
Date: Wed, 27 Mar 2024 12:34:21 -0400
Subject: [PATCH 45/54] [InstallAPI] Add *umbrella-header options (#86587)

Umbrella headers are a concept for Darwin-based libraries. They allow
framework authors to control the order in which their headers should be
parsed and allow clients to access available headers by including a
single header.

InstallAPI will attempt to find the umbrella based on the name of the
framework. Users can also specify this explicitly by using command line
options specifying the umbrella header by file path. There can be an
umbrella header per access level.
---
 .../clang/Basic/DiagnosticInstallAPIKinds.td  |  1 +
 clang/include/clang/InstallAPI/HeaderFile.h   | 16 ++--
 .../Umbrella/Umbrella.framework/Headers/AAA.h |  3 +
 .../Headers/SpecialUmbrella.h                 |  1 +
 .../PrivateHeaders/AAA_Private.h              |  3 +
 .../PrivateHeaders/SpecialPrivateUmbrella.h   |  1 +
 .../InstallAPI/umbrella-headers-unix.test     | 40 ++++++++++
 clang/test/InstallAPI/umbrella-headers.test   | 48 +++++++++++
 .../tools/clang-installapi/InstallAPIOpts.td  | 12 +++
 clang/tools/clang-installapi/Options.cpp      | 79 ++++++++++++++++++-
 clang/tools/clang-installapi/Options.h        |  9 +++
 11 files changed, 206 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/AAA.h
 create mode 100644 clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/SpecialUmbrella.h
 create mode 100644 clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/AAA_Private.h
 create mode 100644 clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h
 create mode 100644 clang/test/InstallAPI/umbrella-headers-unix.test
 create mode 100644 clang/test/InstallAPI/umbrella-headers.test

diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
index 27df731fa28627..e3263fe9ccb9d4 100644
--- a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
+++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
@@ -18,6 +18,7 @@ def err_no_output_file: Error<"no output file specified">;
 def err_no_such_header_file : Error<"no such %select{public|private|project}1 header file: '%0'">;
 def warn_no_such_excluded_header_file : Warning<"no such excluded %select{public|private}0 header file: '%1'">, InGroup<InstallAPIViolation>;
 def warn_glob_did_not_match: Warning<"glob '%0' did not match any header file">, InGroup<InstallAPIViolation>;
+def err_no_such_umbrella_header_file : Error<"%select{public|private|project}1 umbrella header file not found in input: '%0'">;
 } // end of command line category.
 
 let CategoryName = "Verification" in {
diff --git a/clang/include/clang/InstallAPI/HeaderFile.h b/clang/include/clang/InstallAPI/HeaderFile.h
index 235b4da3add840..c67503d4ad49e9 100644
--- a/clang/include/clang/InstallAPI/HeaderFile.h
+++ b/clang/include/clang/InstallAPI/HeaderFile.h
@@ -24,8 +24,6 @@
 
 namespace clang::installapi {
 enum class HeaderType {
-  /// Unset or unknown type.
-  Unknown,
   /// Represents declarations accessible to all clients.
   Public,
   /// Represents declarations accessible to a disclosed set of clients.
@@ -33,6 +31,8 @@ enum class HeaderType {
   /// Represents declarations only accessible as implementation details to the
   /// input library.
   Project,
+  /// Unset or unknown type.
+  Unknown,
 };
 
 inline StringRef getName(const HeaderType T) {
@@ -62,6 +62,8 @@ class HeaderFile {
   bool Excluded{false};
   /// Add header file to processing.
   bool Extra{false};
+  /// Specify that header file is the umbrella header for library.
+  bool Umbrella{false};
 
 public:
   HeaderFile() = delete;
@@ -79,17 +81,21 @@ class HeaderFile {
 
   void setExtra(bool V = true) { Extra = V; }
   void setExcluded(bool V = true) { Excluded = V; }
+  void setUmbrellaHeader(bool V = true) { Umbrella = V; }
   bool isExtra() const { return Extra; }
   bool isExcluded() const { return Excluded; }
+  bool isUmbrellaHeader() const { return Umbrella; }
 
   bool useIncludeName() const {
     return Type != HeaderType::Project && !IncludeName.empty();
   }
 
   bool operator==(const HeaderFile &Other) const {
-    return std::tie(Type, FullPath, IncludeName, Language, Excluded, Extra) ==
-           std::tie(Other.Type, Other.FullPath, Other.IncludeName,
-                    Other.Language, Other.Excluded, Other.Extra);
+    return std::tie(Type, FullPath, IncludeName, Language, Excluded, Extra,
+                    Umbrella) == std::tie(Other.Type, Other.FullPath,
+                                          Other.IncludeName, Other.Language,
+                                          Other.Excluded, Other.Extra,
+                                          Other.Umbrella);
   }
 };
 
diff --git a/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/AAA.h b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/AAA.h
new file mode 100644
index 00000000000000..993d5d4abadb8e
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/AAA.h
@@ -0,0 +1,3 @@
+#ifndef PUBLIC_UMBRELLA_HEADER_FIRST
+#error "Public umbrella header was not included first!"
+#endif
diff --git a/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/SpecialUmbrella.h b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/SpecialUmbrella.h
new file mode 100644
index 00000000000000..2599ff14ae1723
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/Headers/SpecialUmbrella.h
@@ -0,0 +1 @@
+#define PUBLIC_UMBRELLA_HEADER_FIRST
diff --git a/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/AAA_Private.h b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/AAA_Private.h
new file mode 100644
index 00000000000000..557209bfeb8699
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/AAA_Private.h
@@ -0,0 +1,3 @@
+#ifndef PRIVATE_UMBRELLA_HEADER_FIRST
+#error "Private umbrella header was not included first!"
+#endif
diff --git a/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h
new file mode 100644
index 00000000000000..fd5b49b943161a
--- /dev/null
+++ b/clang/test/InstallAPI/Inputs/Umbrella/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h
@@ -0,0 +1 @@
+#define PRIVATE_UMBRELLA_HEADER_FIRST
diff --git a/clang/test/InstallAPI/umbrella-headers-unix.test b/clang/test/InstallAPI/umbrella-headers-unix.test
new file mode 100644
index 00000000000000..46118779896cf1
--- /dev/null
+++ b/clang/test/InstallAPI/umbrella-headers-unix.test
@@ -0,0 +1,40 @@
+// UNSUPPORTED: system-windows
+
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+; RUN: mkdir %t/Frameworks/
+; RUN: cp -r %S/Inputs/Umbrella/Umbrella.framework %t/Frameworks/
+
+// Only validate path based input that rely on regex matching on unix based file systems.
+; RUN: clang-installapi --target=arm64-apple-macosx13 \
+; RUN:  -install_name /System/Library/Frameworks/Umbrella2.framework/Versions/A/Umbrella \
+; RUN: -ObjC -F%t/Frameworks/ %t/inputs.json \
+; RUN: --public-umbrella-header=%t/Frameworks/Umbrella.framework/Headers/SpecialUmbrella.h \
+; RUN: -private-umbrella-header \
+; RUN: %t/Frameworks/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h \
+; RUN: -o %t/output.tbd 2>&1 | FileCheck -allow-empty %s
+
+; CHECK-NOT: error
+; CHECK-NOT: warning
+
+;--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/Frameworks/Umbrella.framework/Headers/AAA.h",
+    "type" : "public"
+  }, 
+  {
+    "path" : "DSTROOT/Frameworks/Umbrella.framework/Headers/SpecialUmbrella.h",
+    "type" : "public"
+  },
+  {
+    "path" : "DSTROOT/Frameworks/Umbrella.framework/PrivateHeaders/AAA_Private.h",
+    "type" : "private"
+  },
+  {
+    "path" : "DSTROOT/Frameworks/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h",
+    "type" : "private"
+  }],
+  "version": "3"
+}
diff --git a/clang/test/InstallAPI/umbrella-headers.test b/clang/test/InstallAPI/umbrella-headers.test
new file mode 100644
index 00000000000000..ce9c50608c4119
--- /dev/null
+++ b/clang/test/InstallAPI/umbrella-headers.test
@@ -0,0 +1,48 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+; RUN: cp -r %S/Inputs/Umbrella/Umbrella.framework %t/Frameworks/
+
+// Check base filename matches.
+; RUN: clang-installapi --target=arm64-apple-macosx13 \
+; RUN: -install_name /System/Library/Frameworks/Umbrella.framework/Versions/A/Umbrella \
+; RUN: -ObjC -F%t/Frameworks/ %t/inputs.json \
+; RUN: --public-umbrella-header=SpecialUmbrella.h \
+; RUN: --private-umbrella-header=SpecialPrivateUmbrella.h \
+; RUN: -o %t/output.tbd 2>&1 | FileCheck -allow-empty %s
+
+// Try missing umbrella header argument.
+; RUN: not clang-installapi --target=arm64-apple-macosx13 \
+; RUN: -install_name /System/Library/Frameworks/Umbrella.framework/Versions/A/Umbrella \
+; RUN: -ObjC -F%t/Frameworks/ %t/inputs.json \
+; RUN: --public-umbrella-header=Ignore.h \
+; RUN: -o %t/output.tbd 2>&1 | FileCheck %s -check-prefix=ERR
+
+; ERR: error: public umbrella header file not found in input: 'Ignore.h'
+
+; CHECK-NOT: error
+; CHECK-NOT: warning
+
+;--- Frameworks/Umbrella.framework/Headers/Ignore.h
+#error "This header should be ignored"
+
+;--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/Frameworks/Umbrella.framework/Headers/AAA.h",
+    "type" : "public"
+  }, 
+  {
+    "path" : "DSTROOT/Frameworks/Umbrella.framework/Headers/SpecialUmbrella.h",
+    "type" : "public"
+  },
+  {
+    "path" : "DSTROOT/Frameworks/Umbrella.framework/PrivateHeaders/AAA_Private.h",
+    "type" : "private"
+  },
+  {
+    "path" : "DSTROOT/Frameworks/Umbrella.framework/PrivateHeaders/SpecialPrivateUmbrella.h",
+    "type" : "private"
+  }],
+  "version": "3"
+}
diff --git a/clang/tools/clang-installapi/InstallAPIOpts.td b/clang/tools/clang-installapi/InstallAPIOpts.td
index ab9e1fe7f2f949..71532c9cf24d17 100644
--- a/clang/tools/clang-installapi/InstallAPIOpts.td
+++ b/clang/tools/clang-installapi/InstallAPIOpts.td
@@ -61,3 +61,15 @@ def exclude_private_header : Separate<["-"], "exclude-private-header">,
   HelpText<"Exclude private header from parsing">;
 def exclude_private_header_EQ : Joined<["--"], "exclude-private-header=">,
   Alias<exclude_private_header>;
+def public_umbrella_header : Separate<["-"], "public-umbrella-header">,
+  MetaVarName<"<path>">, HelpText<"Specify the public umbrella header location">;
+def public_umbrella_header_EQ : Joined<["--"], "public-umbrella-header=">,
+  Alias<public_umbrella_header>;
+def private_umbrella_header : Separate<["-"], "private-umbrella-header">,
+  MetaVarName<"<path>">, HelpText<"Specify the private umbrella header location">;
+def private_umbrella_header_EQ : Joined<["--"], "private-umbrella-header=">,
+  Alias<private_umbrella_header>;
+def project_umbrella_header : Separate<["-"], "project-umbrella-header">,
+  MetaVarName<"<path>">, HelpText<"Specify the project umbrella header location">;
+def project_umbrella_header_EQ : Joined<["--"], "project-umbrella-header=">,
+  Alias<project_umbrella_header>;
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index 4f79c62724a62d..8e4a1b019fd816 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -270,6 +270,16 @@ Options::processAndFilterOutInstallAPIOptions(ArrayRef<const char *> Args) {
                                  OPT_exclude_project_header))
     return {};
 
+  // Handle umbrella headers.
+  if (const Arg *A = ParsedArgs.getLastArg(OPT_public_umbrella_header))
+    DriverOpts.PublicUmbrellaHeader = A->getValue();
+
+  if (const Arg *A = ParsedArgs.getLastArg(OPT_private_umbrella_header))
+    DriverOpts.PrivateUmbrellaHeader = A->getValue();
+
+  if (const Arg *A = ParsedArgs.getLastArg(OPT_project_umbrella_header))
+    DriverOpts.ProjectUmbrellaHeader = A->getValue();
+
   /// Any unclaimed arguments should be forwarded to the clang driver.
   std::vector<const char *> ClangDriverArgs(ParsedArgs.size());
   for (const Arg *A : ParsedArgs) {
@@ -323,6 +333,15 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
   }
 }
 
+static const Regex Rule("(.+)/(.+)\\.framework/");
+static StringRef getFrameworkNameFromInstallName(StringRef InstallName) {
+  SmallVector<StringRef, 3> Match;
+  Rule.match(InstallName, &Match);
+  if (Match.empty())
+    return "";
+  return Match.back();
+}
+
 InstallAPIContext Options::createContext() {
   InstallAPIContext Ctx;
   Ctx.FM = FM;
@@ -339,6 +358,11 @@ InstallAPIContext Options::createContext() {
   Ctx.OutputLoc = DriverOpts.OutputPath;
   Ctx.LangMode = FEOpts.LangMode;
 
+  // Attempt to find umbrella headers by capturing framework name.
+  StringRef FrameworkName;
+  if (!LinkerOpts.IsDylib)
+    FrameworkName = getFrameworkNameFromInstallName(LinkerOpts.InstallName);
+
   // Process inputs.
   for (const std::string &ListPath : DriverOpts.FileLists) {
     auto Buffer = FM->getBufferForFile(ListPath);
@@ -357,8 +381,7 @@ InstallAPIContext Options::createContext() {
     assert(Type != HeaderType::Unknown && "Missing header type.");
     for (const StringRef Path : Headers) {
       if (!FM->getOptionalFileRef(Path)) {
-        Diags->Report(diag::err_no_such_header_file)
-            << Path << (unsigned)Type - 1;
+        Diags->Report(diag::err_no_such_header_file) << Path << (unsigned)Type;
         return false;
       }
       SmallString<PATH_MAX> FullPath(Path);
@@ -382,6 +405,7 @@ InstallAPIContext Options::createContext() {
   std::vector<std::unique_ptr<HeaderGlob>> ExcludedHeaderGlobs;
   std::set<FileEntryRef> ExcludedHeaderFiles;
   auto ParseGlobs = [&](const PathSeq &Paths, HeaderType Type) {
+    assert(Type != HeaderType::Unknown && "Missing header type.");
     for (const StringRef Path : Paths) {
       auto Glob = HeaderGlob::create(Path, Type);
       if (Glob)
@@ -424,6 +448,57 @@ InstallAPIContext Options::createContext() {
     if (!Glob->didMatch())
       Diags->Report(diag::warn_glob_did_not_match) << Glob->str();
 
+  // Mark any explicit or inferred umbrella headers. If one exists, move
+  // that to the beginning of the input headers.
+  auto MarkandMoveUmbrellaInHeaders = [&](llvm::Regex &Regex,
+                                          HeaderType Type) -> bool {
+    auto It = find_if(Ctx.InputHeaders, [&Regex, Type](const HeaderFile &H) {
+      return (H.getType() == Type) && Regex.match(H.getPath());
+    });
+
+    if (It == Ctx.InputHeaders.end())
+      return false;
+    It->setUmbrellaHeader();
+
+    // Because there can be an umbrella header per header type,
+    // find the first non umbrella header to swap position with.
+    auto BeginPos = find_if(Ctx.InputHeaders, [](const HeaderFile &H) {
+      return !H.isUmbrellaHeader();
+    });
+    if (BeginPos != Ctx.InputHeaders.end() && BeginPos < It)
+      std::swap(*BeginPos, *It);
+    return true;
+  };
+
+  auto FindUmbrellaHeader = [&](StringRef HeaderPath, HeaderType Type) -> bool {
+    assert(Type != HeaderType::Unknown && "Missing header type.");
+    if (!HeaderPath.empty()) {
+      auto EscapedString = Regex::escape(HeaderPath);
+      Regex UmbrellaRegex(EscapedString);
+      if (!MarkandMoveUmbrellaInHeaders(UmbrellaRegex, Type)) {
+        Diags->Report(diag::err_no_such_umbrella_header_file)
+            << HeaderPath << (unsigned)Type;
+        return false;
+      }
+    } else if (!FrameworkName.empty() && (Type != HeaderType::Project)) {
+      auto UmbrellaName = "/" + Regex::escape(FrameworkName);
+      if (Type == HeaderType::Public)
+        UmbrellaName += "\\.h";
+      else
+        UmbrellaName += "[_]?Private\\.h";
+      Regex UmbrellaRegex(UmbrellaName);
+      MarkandMoveUmbrellaInHeaders(UmbrellaRegex, Type);
+    }
+    return true;
+  };
+  if (!FindUmbrellaHeader(DriverOpts.PublicUmbrellaHeader,
+                          HeaderType::Public) ||
+      !FindUmbrellaHeader(DriverOpts.PrivateUmbrellaHeader,
+                          HeaderType::Private) ||
+      !FindUmbrellaHeader(DriverOpts.ProjectUmbrellaHeader,
+                          HeaderType::Project))
+    return Ctx;
+
   // Parse binary dylib and initialize verifier.
   if (DriverOpts.DylibToVerify.empty()) {
     Ctx.Verifier = std::make_unique<DylibVerifier>();
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index c18309f693701e..3671e4c8274bd3 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -31,6 +31,15 @@ struct DriverOptions {
   /// \brief Path to input file lists (JSON).
   llvm::MachO::PathSeq FileLists;
 
+  /// \brief Path to public umbrella header.
+  std::string PublicUmbrellaHeader;
+
+  /// \brief Path to private umbrella header.
+  std::string PrivateUmbrellaHeader;
+
+  /// \brief Path to project umbrella header.
+  std::string ProjectUmbrellaHeader;
+
   /// \brief Paths of extra public headers.
   PathSeq ExtraPublicHeaders;
 

>From 0099c584bad3bdeb62fede61fb89fdcc022bd2a0 Mon Sep 17 00:00:00 2001
From: Keith Smiley <keithbsmiley at gmail.com>
Date: Wed, 27 Mar 2024 09:35:45 -0700
Subject: [PATCH 46/54] [bazel] Remove -lm on macOS (#86706)

Bazel links this library by default which leads to this linker warning
on macOS:

```
ld: warning: ignoring duplicate libraries: '-lm'
```
---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 0e658353c36f3d..3c3e17bfec668f 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -292,6 +292,10 @@ cc_library(
             "-ldl",
             "-lm",
         ],
+        "@platforms//os:macos": [
+            "-pthread",
+            "-ldl",
+        ],
         "//conditions:default": [
             "-pthread",
             "-ldl",

>From fca48312a833464369ce1615c60e09f1d71e4aad Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 16:35:46 +0000
Subject: [PATCH 47/54] Fix signed/unsigned comparison warning. NFC.

---
 compiler-rt/lib/scudo/standalone/tests/strings_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp
index e068c48fc97c9a..3e41f67ba922b7 100644
--- a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp
@@ -141,7 +141,7 @@ TEST(ScudoStringsTest, CapacityIncreaseFails) {
 
   // Test requires that the default length is at least 6 characters.
   scudo::uptr MaxSize = Str.capacity();
-  EXPECT_LE(6, MaxSize);
+  EXPECT_LE(6u, MaxSize);
 
   for (size_t i = 0; i < MaxSize - 5; i++) {
     Str.append("B");

>From 4d177435bae03551245ffdc4dfcee5345323121d Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek at amd.com>
Date: Wed, 27 Mar 2024 11:37:09 -0500
Subject: [PATCH 48/54] [flang][OpenMP] Rename makeList overloads to
 make{Objects,Clauses}, NFC (#86725)

Reserve `makeList` to create a list given an explicit converter
function.
---
 flang/lib/Lower/OpenMP/ClauseProcessor.h      |  2 +-
 flang/lib/Lower/OpenMP/Clauses.cpp            | 52 +++++++++----------
 flang/lib/Lower/OpenMP/Clauses.h              |  8 +--
 flang/lib/Lower/OpenMP/DataSharingProcessor.h |  2 +-
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  4 +-
 5 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index c0c603feb296af..d31d6a5c20623a 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -51,7 +51,7 @@ class ClauseProcessor {
                   Fortran::semantics::SemanticsContext &semaCtx,
                   const Fortran::parser::OmpClauseList &clauses)
       : converter(converter), semaCtx(semaCtx),
-        clauses(makeList(clauses, semaCtx)) {}
+        clauses(makeClauses(clauses, semaCtx)) {}
 
   // 'Unique' clauses: They can appear at most once in the clause list.
   bool processCollapse(
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index f48e84f511a44d..853dcd78e26683 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -347,7 +347,7 @@ Aligned make(const parser::OmpClause::Aligned &inp,
 
   return Aligned{{
       /*Alignment=*/maybeApply(makeExprFn(semaCtx), t1),
-      /*List=*/makeList(t0, semaCtx),
+      /*List=*/makeObjects(t0, semaCtx),
   }};
 }
 
@@ -362,7 +362,7 @@ Allocate make(const parser::OmpClause::Allocate &inp,
     return Allocate{{/*AllocatorSimpleModifier=*/std::nullopt,
                      /*AllocatorComplexModifier=*/std::nullopt,
                      /*AlignModifier=*/std::nullopt,
-                     /*List=*/makeList(t1, semaCtx)}};
+                     /*List=*/makeObjects(t1, semaCtx)}};
   }
 
   using Tuple = decltype(Allocate::t);
@@ -374,7 +374,7 @@ Allocate make(const parser::OmpClause::Allocate &inp,
             return {/*AllocatorSimpleModifier=*/makeExpr(v.v, semaCtx),
                     /*AllocatorComplexModifier=*/std::nullopt,
                     /*AlignModifier=*/std::nullopt,
-                    /*List=*/makeList(t1, semaCtx)};
+                    /*List=*/makeObjects(t1, semaCtx)};
           },
           // complex-modifier + align-modifier
           [&](const wrapped::AllocateModifier::ComplexModifier &v) -> Tuple {
@@ -384,14 +384,14 @@ Allocate make(const parser::OmpClause::Allocate &inp,
                 /*AllocatorSimpleModifier=*/std::nullopt,
                 /*AllocatorComplexModifier=*/Allocator{makeExpr(s0.v, semaCtx)},
                 /*AlignModifier=*/Align{makeExpr(s1.v, semaCtx)},
-                /*List=*/makeList(t1, semaCtx)};
+                /*List=*/makeObjects(t1, semaCtx)};
           },
           // align-modifier
           [&](const wrapped::AllocateModifier::Align &v) -> Tuple {
             return {/*AllocatorSimpleModifier=*/std::nullopt,
                     /*AllocatorComplexModifier=*/std::nullopt,
                     /*AlignModifier=*/Align{makeExpr(v.v, semaCtx)},
-                    /*List=*/makeList(t1, semaCtx)};
+                    /*List=*/makeObjects(t1, semaCtx)};
           },
       },
       t0->u)};
@@ -450,13 +450,13 @@ Collapse make(const parser::OmpClause::Collapse &inp,
 Copyin make(const parser::OmpClause::Copyin &inp,
             semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return Copyin{/*List=*/makeList(inp.v, semaCtx)};
+  return Copyin{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 Copyprivate make(const parser::OmpClause::Copyprivate &inp,
                  semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return Copyprivate{/*List=*/makeList(inp.v, semaCtx)};
+  return Copyprivate{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 Default make(const parser::OmpClause::Default &inp,
@@ -641,7 +641,7 @@ Doacross make(const parser::OmpClause::Doacross &inp,
 Enter make(const parser::OmpClause::Enter &inp,
            semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return Enter{makeList(/*List=*/inp.v, semaCtx)};
+  return Enter{makeObjects(/*List=*/inp.v, semaCtx)};
 }
 
 Exclusive make(const parser::OmpClause::Exclusive &inp,
@@ -671,7 +671,7 @@ Final make(const parser::OmpClause::Final &inp,
 Firstprivate make(const parser::OmpClause::Firstprivate &inp,
                   semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return Firstprivate{/*List=*/makeList(inp.v, semaCtx)};
+  return Firstprivate{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 // Flush: empty
@@ -681,7 +681,7 @@ From make(const parser::OmpClause::From &inp,
   // inp.v -> parser::OmpObjectList
   return From{{/*Expectation=*/std::nullopt, /*Mapper=*/std::nullopt,
                /*Iterator=*/std::nullopt,
-               /*LocatorList=*/makeList(inp.v, semaCtx)}};
+               /*LocatorList=*/makeObjects(inp.v, semaCtx)}};
 }
 
 // Full: empty
@@ -696,7 +696,7 @@ Grainsize make(const parser::OmpClause::Grainsize &inp,
 HasDeviceAddr make(const parser::OmpClause::HasDeviceAddr &inp,
                    semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return HasDeviceAddr{/*List=*/makeList(inp.v, semaCtx)};
+  return HasDeviceAddr{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 Hint make(const parser::OmpClause::Hint &inp,
@@ -762,20 +762,20 @@ InReduction make(const parser::OmpClause::InReduction &inp,
   auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
   return InReduction{
       {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)},
-       /*List=*/makeList(t1, semaCtx)}};
+       /*List=*/makeObjects(t1, semaCtx)}};
 }
 
 IsDevicePtr make(const parser::OmpClause::IsDevicePtr &inp,
                  semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return IsDevicePtr{/*List=*/makeList(inp.v, semaCtx)};
+  return IsDevicePtr{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 Lastprivate make(const parser::OmpClause::Lastprivate &inp,
                  semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
   return Lastprivate{{/*LastprivateModifier=*/std::nullopt,
-                      /*List=*/makeList(inp.v, semaCtx)}};
+                      /*List=*/makeObjects(inp.v, semaCtx)}};
 }
 
 Linear make(const parser::OmpClause::Linear &inp,
@@ -817,7 +817,7 @@ Linear make(const parser::OmpClause::Linear &inp,
 Link make(const parser::OmpClause::Link &inp,
           semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return Link{/*List=*/makeList(inp.v, semaCtx)};
+  return Link{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 Map make(const parser::OmpClause::Map &inp,
@@ -844,7 +844,7 @@ Map make(const parser::OmpClause::Map &inp,
   if (!t0) {
     return Map{{/*MapType=*/std::nullopt, /*MapTypeModifiers=*/std::nullopt,
                 /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt,
-                /*LocatorList=*/makeList(t1, semaCtx)}};
+                /*LocatorList=*/makeObjects(t1, semaCtx)}};
   }
 
   auto &s0 = std::get<std::optional<parser::OmpMapType::Always>>(t0->t);
@@ -857,7 +857,7 @@ Map make(const parser::OmpClause::Map &inp,
   return Map{{/*MapType=*/convert1(s1),
               /*MapTypeModifiers=*/maybeList,
               /*Mapper=*/std::nullopt, /*Iterator=*/std::nullopt,
-              /*LocatorList=*/makeList(t1, semaCtx)}};
+              /*LocatorList=*/makeObjects(t1, semaCtx)}};
 }
 
 // Match: incomplete
@@ -980,7 +980,7 @@ Priority make(const parser::OmpClause::Priority &inp,
 Private make(const parser::OmpClause::Private &inp,
              semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return Private{/*List=*/makeList(inp.v, semaCtx)};
+  return Private{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 ProcBind make(const parser::OmpClause::ProcBind &inp,
@@ -1010,7 +1010,7 @@ Reduction make(const parser::OmpClause::Reduction &inp,
   return Reduction{
       {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)},
        /*ReductionModifier=*/std::nullopt,
-       /*List=*/makeList(t1, semaCtx)}};
+       /*List=*/makeObjects(t1, semaCtx)}};
 }
 
 // Relaxed: empty
@@ -1104,7 +1104,7 @@ Severity make(const parser::OmpClause::Severity &inp,
 Shared make(const parser::OmpClause::Shared &inp,
             semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return Shared{/*List=*/makeList(inp.v, semaCtx)};
+  return Shared{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 // Simd: empty
@@ -1128,7 +1128,7 @@ TaskReduction make(const parser::OmpClause::TaskReduction &inp,
   auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
   return TaskReduction{
       {/*ReductionIdentifiers=*/{makeReductionOperator(t0, semaCtx)},
-       /*List=*/makeList(t1, semaCtx)}};
+       /*List=*/makeObjects(t1, semaCtx)}};
 }
 
 ThreadLimit make(const parser::OmpClause::ThreadLimit &inp,
@@ -1145,7 +1145,7 @@ To make(const parser::OmpClause::To &inp,
   // inp.v -> parser::OmpObjectList
   return To{{/*Expectation=*/std::nullopt, /*Mapper=*/std::nullopt,
              /*Iterator=*/std::nullopt,
-             /*LocatorList=*/makeList(inp.v, semaCtx)}};
+             /*LocatorList=*/makeObjects(inp.v, semaCtx)}};
 }
 
 // UnifiedAddress: empty
@@ -1175,13 +1175,13 @@ Use make(const parser::OmpClause::Use &inp,
 UseDeviceAddr make(const parser::OmpClause::UseDeviceAddr &inp,
                    semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return UseDeviceAddr{/*List=*/makeList(inp.v, semaCtx)};
+  return UseDeviceAddr{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 UseDevicePtr make(const parser::OmpClause::UseDevicePtr &inp,
                   semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpObjectList
-  return UseDevicePtr{/*List=*/makeList(inp.v, semaCtx)};
+  return UseDevicePtr{/*List=*/makeObjects(inp.v, semaCtx)};
 }
 
 UsesAllocators make(const parser::OmpClause::UsesAllocators &inp,
@@ -1205,8 +1205,8 @@ Clause makeClause(const Fortran::parser::OmpClause &cls,
       cls.u);
 }
 
-List<Clause> makeList(const parser::OmpClauseList &clauses,
-                      semantics::SemanticsContext &semaCtx) {
+List<Clause> makeClauses(const parser::OmpClauseList &clauses,
+                         semantics::SemanticsContext &semaCtx) {
   return makeList(clauses.v, [&](const parser::OmpClause &s) {
     return makeClause(s, semaCtx);
   });
diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h
index af1318226e8cb1..3e776425c733e0 100644
--- a/flang/lib/Lower/OpenMP/Clauses.h
+++ b/flang/lib/Lower/OpenMP/Clauses.h
@@ -88,8 +88,8 @@ List<ResultTy> makeList(ContainerTy &&container, FunctionTy &&func) {
   return v;
 }
 
-inline ObjectList makeList(const parser::OmpObjectList &objects,
-                           semantics::SemanticsContext &semaCtx) {
+inline ObjectList makeObjects(const parser::OmpObjectList &objects,
+                              semantics::SemanticsContext &semaCtx) {
   return makeList(objects.v, makeObjectFn(semaCtx));
 }
 
@@ -256,8 +256,8 @@ Clause makeClause(llvm::omp::Clause id, Specific &&specific,
 Clause makeClause(const Fortran::parser::OmpClause &cls,
                   semantics::SemanticsContext &semaCtx);
 
-List<Clause> makeList(const parser::OmpClauseList &clauses,
-                      semantics::SemanticsContext &semaCtx);
+List<Clause> makeClauses(const parser::OmpClauseList &clauses,
+                         semantics::SemanticsContext &semaCtx);
 } // namespace Fortran::lower::omp
 
 #endif // FORTRAN_LOWER_OPENMP_CLAUSES_H
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index 226abe96705e35..1cbc825fd5e11b 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -89,7 +89,7 @@ class DataSharingProcessor {
                        Fortran::lower::SymMap *symTable = nullptr)
       : hasLastPrivateOp(false), converter(converter),
         firOpBuilder(converter.getFirOpBuilder()),
-        clauses(omp::makeList(opClauseList, semaCtx)), eval(eval),
+        clauses(omp::makeClauses(opClauseList, semaCtx)), eval(eval),
         useDelayedPrivatization(useDelayedPrivatization), symTable(symTable) {}
 
   // Privatisation is split into two steps.
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 0cf2a8f97040a8..5defffd738b4e8 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1254,7 +1254,7 @@ static mlir::omp::DeclareTargetDeviceType getDeclareTargetInfo(
 
   if (const auto *objectList{
           Fortran::parser::Unwrap<Fortran::parser::OmpObjectList>(spec.u)}) {
-    ObjectList objects{makeList(*objectList, semaCtx)};
+    ObjectList objects{makeObjects(*objectList, semaCtx)};
     // Case: declare target(func, var1, var2)
     gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to,
                          symbolAndClause);
@@ -2352,7 +2352,7 @@ void Fortran::lower::genOpenMPReduction(
     const Fortran::parser::OmpClauseList &clauseList) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  List<Clause> clauses{makeList(clauseList, semaCtx)};
+  List<Clause> clauses{makeClauses(clauseList, semaCtx)};
 
   for (const Clause &clause : clauses) {
     if (const auto &reductionClause =

>From 1c965801c42c92ff0b768e31348285514ecf5511 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 27 Mar 2024 09:39:35 -0700
Subject: [PATCH 49/54] [LegalizeDAG] Merge PerformInsertVectorEltInMemory into
 ExpandInsertToVectorThroughStack. NFC (#86755)

These functions are very similar. We can share them like we do for
EXTRACT_VECTOR_ELT and EXTRACT_SUBVECTOR.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 83 ++++++-------------
 1 file changed, 26 insertions(+), 57 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index b1f6fd6f3c7220..e10b8bc8c5e2eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -118,14 +118,7 @@ class SelectionDAGLegalize {
   void LegalizeLoadOps(SDNode *Node);
   void LegalizeStoreOps(SDNode *Node);
 
-  /// Some targets cannot handle a variable
-  /// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
-  /// is necessary to spill the vector being inserted into to memory, perform
-  /// the insert there, and then read the result back.
-  SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
-                                         const SDLoc &dl);
-  SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx,
-                                  const SDLoc &dl);
+  SDValue ExpandINSERT_VECTOR_ELT(SDValue Op);
 
   /// Return a vector shuffle operation which
   /// performs the same shuffe in terms of order or result bytes, but on a type
@@ -378,45 +371,12 @@ SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) {
   return Result;
 }
 
-/// Some target cannot handle a variable insertion index for the
-/// INSERT_VECTOR_ELT instruction.  In this case, it
-/// is necessary to spill the vector being inserted into to memory, perform
-/// the insert there, and then read the result back.
-SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec,
-                                                             SDValue Val,
-                                                             SDValue Idx,
-                                                             const SDLoc &dl) {
-  // If the target doesn't support this, we have to spill the input vector
-  // to a temporary stack slot, update the element, then reload it.  This is
-  // badness.  We could also load the value into a vector register (either
-  // with a "move to register" or "extload into register" instruction, then
-  // permute it into place, if the idx is a constant and if the idx is
-  // supported by the target.
-  EVT VT    = Vec.getValueType();
-  EVT EltVT = VT.getVectorElementType();
-  SDValue StackPtr = DAG.CreateStackTemporary(VT);
-
-  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
-
-  // Store the vector.
-  SDValue Ch = DAG.getStore(
-      DAG.getEntryNode(), dl, Vec, StackPtr,
-      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI));
-
-  SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, Idx);
-
-  // Store the scalar value.
-  Ch = DAG.getTruncStore(
-      Ch, dl, Val, StackPtr2,
-      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
-  // Load the updated vector.
-  return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack(
-                                               DAG.getMachineFunction(), SPFI));
-}
+SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Op) {
+  SDValue Vec = Op.getOperand(0);
+  SDValue Val = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  SDLoc dl(Op);
 
-SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
-                                                      SDValue Idx,
-                                                      const SDLoc &dl) {
   if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
     // SCALAR_TO_VECTOR requires that the type of the value being inserted
     // match the element type of the vector being created, except for
@@ -438,7 +398,7 @@ SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
       return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec, ShufOps);
     }
   }
-  return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);
+  return ExpandInsertToVectorThroughStack(Op);
 }
 
 SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
@@ -1486,7 +1446,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
 
   // Store the value to a temporary stack slot, then LOAD the returned part.
   EVT VecVT = Vec.getValueType();
-  EVT SubVecVT = Part.getValueType();
+  EVT PartVT = Part.getValueType();
   SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
   int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
   MachinePointerInfo PtrInfo =
@@ -1496,13 +1456,24 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
 
   // Then store the inserted part.
-  SDValue SubStackPtr =
-      TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
+  if (PartVT.isVector()) {
+    SDValue SubStackPtr =
+        TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, PartVT, Idx);
+
+    // Store the subvector.
+    Ch = DAG.getStore(
+        Ch, dl, Part, SubStackPtr,
+        MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+  } else {
+    SDValue SubStackPtr =
+        TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
 
-  // Store the subvector.
-  Ch = DAG.getStore(
-      Ch, dl, Part, SubStackPtr,
-      MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+    // Store the scalar value.
+    Ch = DAG.getTruncStore(
+        Ch, dl, Part, SubStackPtr,
+        MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+        VecVT.getVectorElementType());
+  }
 
   // Finally, load the updated vector.
   return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo);
@@ -3416,9 +3387,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
     break;
   case ISD::INSERT_VECTOR_ELT:
-    Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0),
-                                              Node->getOperand(1),
-                                              Node->getOperand(2), dl));
+    Results.push_back(ExpandINSERT_VECTOR_ELT(SDValue(Node, 0)));
     break;
   case ISD::VECTOR_SHUFFLE: {
     SmallVector<int, 32> NewMask;

>From c335accb07c0cfa4bd7f47edc94c9005692edfcc Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Wed, 27 Mar 2024 09:47:16 -0700
Subject: [PATCH 50/54] [ELF] --pack-dyn-relocs=android+relr: place IRELATIVE
 in .rela.plt (#86751)

Current Bionic processes relocations in this order:

* DT_ANDROID_REL[A]
* DT_RELR
* DT_REL[A]
* DT_JMPREL

If an IRELATIVE relocation is in DT_ANDROID_REL[A], its resolver would read
unrelocated (incorrect) global variables associated with RELR when
--pack-dyn-relocs=android+relr is enabled. Work around this by placing
IRELATIVE in .rel[a].plt (DT_JMPREL).

Link: https://r.android.com/3014185
---
 lld/ELF/Relocations.cpp              | 11 +++++--
 lld/test/ELF/pack-dyn-relocs-ifunc.s | 49 ++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 lld/test/ELF/pack-dyn-relocs-ifunc.s

diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 33c50133bec495..92f2e200db1107 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1659,10 +1659,17 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) {
   // original section/value pairs. For non-GOT non-PLT relocation case below, we
   // may alter section/value, so create a copy of the symbol to make
   // section/value fixed.
+  //
+  // Prior to Android V, there was a bug that caused RELR relocations to be
+  // applied after packed relocations. This meant that resolvers referenced by
+  // IRELATIVE relocations in the packed relocation section would read
+  // unrelocated globals with RELR relocations when
+  // --pack-dyn-relocs=android+relr is enabled. Work around this by placing
+  // IRELATIVE in .rela.plt.
   auto *directSym = makeDefined(cast<Defined>(sym));
   directSym->allocateAux();
-  addPltEntry(*in.iplt, *in.igotPlt, *mainPart->relaDyn, target->iRelativeRel,
-              *directSym);
+  auto &dyn = config->androidPackDynRelocs ? *in.relaPlt : *mainPart->relaDyn;
+  addPltEntry(*in.iplt, *in.igotPlt, dyn, target->iRelativeRel, *directSym);
   sym.allocateAux();
   symAux.back().pltIdx = symAux[directSym->auxIdx].pltIdx;
 
diff --git a/lld/test/ELF/pack-dyn-relocs-ifunc.s b/lld/test/ELF/pack-dyn-relocs-ifunc.s
new file mode 100644
index 00000000000000..6168d06f99d9e9
--- /dev/null
+++ b/lld/test/ELF/pack-dyn-relocs-ifunc.s
@@ -0,0 +1,49 @@
+# REQUIRES: aarch64
+## Prior to Android V, there was a bug that caused RELR relocations to be
+## applied after packed relocations. This meant that resolvers referenced by
+## IRELATIVE relocations in the packed relocation section would read unrelocated
+## globals when --pack-dyn-relocs=android+relr is enabled. Work around this
+## by placing IRELATIVE in .rela.plt.
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-android a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-android b.s -o b.o
+# RUN: ld.lld -shared b.o -o b.so
+# RUN: ld.lld -pie --pack-dyn-relocs=android+relr -z separate-loadable-segments a.o b.so -o a
+# RUN: llvm-readobj -r a | FileCheck %s
+# RUN: llvm-objdump -d a | FileCheck %s --check-prefix=ASM
+
+# CHECK:      .relr.dyn {
+# CHECK-NEXT:   0x30000 R_AARCH64_RELATIVE -
+# CHECK-NEXT: }
+# CHECK:      .rela.plt {
+# CHECK-NEXT:   0x30020 R_AARCH64_JUMP_SLOT bar 0x0
+# CHECK-NEXT:   0x30028 R_AARCH64_IRELATIVE - 0x10000
+# CHECK-NEXT: }
+
+# ASM:      <.iplt>:
+# ASM-NEXT:   adrp    x16, 0x30000
+# ASM-NEXT:   ldr     x17, [x16, #0x28]
+# ASM-NEXT:   add     x16, x16, #0x28
+# ASM-NEXT:   br      x17
+
+#--- a.s
+.text
+.type foo, %gnu_indirect_function
+.globl foo
+foo:
+  ret
+
+.globl _start
+_start:
+  bl foo
+  bl bar
+
+.data
+.balign 8
+.quad .data
+
+#--- b.s
+.globl bar
+bar:
+  ret

>From dcd0f2b6103072b74b446c2d1e9ecec60001a28c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 17:01:41 +0000
Subject: [PATCH 51/54] [X86] combineExtractFromVectorLoad support extraction
 from vector of different types to the extraction type/index

combineExtractFromVectorLoad no longer uses the vector we're extracting from to determine the pointer offset calculation, allowing us to extract from types that have been bitcast to work with specific target shuffles.

Fixes #85419
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  23 +-
 llvm/test/CodeGen/X86/extractelement-load.ll  |  35 +-
 llvm/test/CodeGen/X86/pr45378.ll              |  40 +--
 .../test/CodeGen/X86/setcc-non-simple-type.ll |  36 +--
 llvm/test/CodeGen/X86/var-permute-128.ll      |  32 +-
 llvm/test/CodeGen/X86/vec_int_to_fp.ll        | 305 +++++++-----------
 6 files changed, 179 insertions(+), 292 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a229f6e55a9880..9d98d31b31df0b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43999,18 +43999,18 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
 // integer, that requires a potentially expensive XMM -> GPR transfer.
 // Additionally, if we can convert to a scalar integer load, that will likely
 // be folded into a subsequent integer op.
+// Note: SrcVec might not have a VecVT type, but it must be the same size.
 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
 //       to a single-use of the loaded vector. For the reasons above, we
 //       expect this to be profitable even if it creates an extra load.
 static SDValue
-combineExtractFromVectorLoad(SDNode *N, SDValue InputVector, uint64_t Idx,
+combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
                              const SDLoc &dl, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
   assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          "Only EXTRACT_VECTOR_ELT supported so far");
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  EVT SrcVT = InputVector.getValueType();
   EVT VT = N->getValueType(0);
 
   bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
@@ -44019,12 +44019,13 @@ combineExtractFromVectorLoad(SDNode *N, SDValue InputVector, uint64_t Idx,
            Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
   });
 
-  auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
+  auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
   if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
-      SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
-      !LikelyUsedAsVector && LoadVec->isSimple()) {
+      VecVT.getVectorElementType() == VT &&
+      VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
+      DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
     SDValue NewPtr = TLI.getVectorElementPointer(
-        DAG, LoadVec->getBasePtr(), SrcVT, DAG.getVectorIdxConstant(Idx, dl));
+        DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
     unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
     MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
     Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
@@ -44234,10 +44235,9 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
     return DAG.getZExtOrTrunc(V, dl, VT);
 
-  if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT &&
-      SrcOp.getValueType() == SrcVT)
-    if (SDValue V =
-            combineExtractFromVectorLoad(N, SrcOp, ExtractIdx, dl, DAG, DCI))
+  if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
+    if (SDValue V = combineExtractFromVectorLoad(
+            N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
       return V;
 
   return SDValue();
@@ -44651,7 +44651,8 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
 
   if (CIdx)
     if (SDValue V = combineExtractFromVectorLoad(
-            N, InputVector, CIdx->getZExtValue(), dl, DAG, DCI))
+            N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
+            dl, DAG, DCI))
       return V;
 
   // Attempt to extract a i1 element by using MOVMSK to extract the signbits
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index ba2217f704bd72..022b25a2415333 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -76,11 +76,9 @@ bb:
 define i64 @t4(ptr %a) {
 ; X86-SSE2-LABEL: t4:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqa (%eax), %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, %eax
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl (%ecx), %eax
+; X86-SSE2-NEXT:    movl 4(%ecx), %edx
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-LABEL: t4:
@@ -289,24 +287,15 @@ define i32 @PR85419(ptr %p0) {
 ; X86-SSE2-NEXT:  .LBB8_2:
 ; X86-SSE2-NEXT:    retl
 ;
-; X64-SSSE3-LABEL: PR85419:
-; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    xorl %ecx, %ecx
-; X64-SSSE3-NEXT:    cmpq $0, (%rdi)
-; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
-; X64-SSSE3-NEXT:    movd %xmm0, %eax
-; X64-SSSE3-NEXT:    cmovel %ecx, %eax
-; X64-SSSE3-NEXT:    retq
-;
-; X64-AVX-LABEL: PR85419:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    xorl %eax, %eax
-; X64-AVX-NEXT:    cmpq $0, (%rdi)
-; X64-AVX-NEXT:    je .LBB8_2
-; X64-AVX-NEXT:  # %bb.1:
-; X64-AVX-NEXT:    movl 8(%rdi), %eax
-; X64-AVX-NEXT:  .LBB8_2:
-; X64-AVX-NEXT:    retq
+; X64-LABEL: PR85419:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq $0, (%rdi)
+; X64-NEXT:    je .LBB8_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    movl 8(%rdi), %eax
+; X64-NEXT:  .LBB8_2:
+; X64-NEXT:    retq
   %load = load <2 x i64>, ptr %p0, align 16
   %vecext.i = extractelement <2 x i64> %load, i64 0
   %cmp = icmp eq i64 %vecext.i, 0
diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll
index 426f4eed662a09..6a5770a4b4ad30 100644
--- a/llvm/test/CodeGen/X86/pr45378.ll
+++ b/llvm/test/CodeGen/X86/pr45378.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2     | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1   | FileCheck %s --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx      | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2     | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f  | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2     | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1   | FileCheck %s --check-prefixes=CHECK,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx      | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2     | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f  | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX
 
 declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
 
@@ -71,28 +71,12 @@ define i1 @parseHeaders2_scalar_or(ptr %ptr) nounwind {
 }
 
 define i1 @parseHeaders2_scalar_and(ptr %ptr) nounwind {
-; SSE2-LABEL: parseHeaders2_scalar_and:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, (%rdi)
-; SSE2-NEXT:    sete %al
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: parseHeaders2_scalar_and:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq (%rdi), %rax
-; SSE41-NEXT:    testq %rax, 8(%rdi)
-; SSE41-NEXT:    sete %al
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: parseHeaders2_scalar_and:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rax
-; AVX-NEXT:    testq %rax, 8(%rdi)
-; AVX-NEXT:    sete %al
-; AVX-NEXT:    retq
+; CHECK-LABEL: parseHeaders2_scalar_and:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    testq %rax, 8(%rdi)
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
   %vload = load <2 x i64>, ptr %ptr, align 8
   %v1 = extractelement <2 x i64> %vload, i32 0
   %v2 = extractelement <2 x i64> %vload, i32 1
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 2187c653f76c3b..97c3c2040b2914 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -60,36 +60,30 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-NEXT:  .LBB0_2: # %vector.body
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movdqu 1024(%rdx,%rdi), %xmm5
-; CHECK-NEXT:    movdqu 1040(%rdx,%rdi), %xmm6
-; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; CHECK-NEXT:    movq %xmm5, %r8
-; CHECK-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; CHECK-NEXT:    movq %xmm5, %r9
-; CHECK-NEXT:    cmpq 1040(%rdx,%rdi), %rsi
-; CHECK-NEXT:    movq %rcx, %r10
-; CHECK-NEXT:    sbbq %r9, %r10
-; CHECK-NEXT:    setge %r9b
-; CHECK-NEXT:    movzbl %r9b, %r9d
-; CHECK-NEXT:    andl $1, %r9d
-; CHECK-NEXT:    negq %r9
-; CHECK-NEXT:    movq %r9, %xmm5
 ; CHECK-NEXT:    cmpq 1024(%rdx,%rdi), %rsi
-; CHECK-NEXT:    movq %rcx, %r9
-; CHECK-NEXT:    sbbq %r8, %r9
+; CHECK-NEXT:    movq %rcx, %r8
+; CHECK-NEXT:    sbbq 1032(%rdx,%rdi), %r8
+; CHECK-NEXT:    setge %r8b
+; CHECK-NEXT:    movzbl %r8b, %r8d
+; CHECK-NEXT:    andl $1, %r8d
+; CHECK-NEXT:    negq %r8
+; CHECK-NEXT:    movq %r8, %xmm5
+; CHECK-NEXT:    cmpq 1040(%rdx,%rdi), %rsi
+; CHECK-NEXT:    movq %rcx, %r8
+; CHECK-NEXT:    sbbq 1048(%rdx,%rdi), %r8
 ; CHECK-NEXT:    setge %r8b
 ; CHECK-NEXT:    movzbl %r8b, %r8d
 ; CHECK-NEXT:    andl $1, %r8d
 ; CHECK-NEXT:    negq %r8
 ; CHECK-NEXT:    movq %r8, %xmm6
-; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
-; CHECK-NEXT:    movdqa %xmm1, %xmm5
-; CHECK-NEXT:    psllq %xmm4, %xmm5
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; CHECK-NEXT:    movdqa %xmm1, %xmm6
+; CHECK-NEXT:    psllq %xmm4, %xmm6
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
 ; CHECK-NEXT:    movdqa %xmm1, %xmm8
 ; CHECK-NEXT:    psllq %xmm7, %xmm8
-; CHECK-NEXT:    movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
-; CHECK-NEXT:    andpd %xmm6, %xmm8
+; CHECK-NEXT:    movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
+; CHECK-NEXT:    andpd %xmm5, %xmm8
 ; CHECK-NEXT:    orpd %xmm8, %xmm3
 ; CHECK-NEXT:    paddq %xmm2, %xmm4
 ; CHECK-NEXT:    addq $32, %rdi
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 99a3821bb9ba91..f2240a94684427 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -1101,17 +1101,13 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
 define void @indices_convert() {
 ; SSE3-LABEL: indices_convert:
 ; SSE3:       # %bb.0: # %bb
-; SSE3-NEXT:    movdqa (%rax), %xmm0
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE3-NEXT:    movd %xmm1, %eax
-; SSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movaps (%rax), %xmm0
+; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movl (%rax), %eax
+; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE3-NEXT:    andl $3, %eax
-; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE3-NEXT:    movd %xmm1, %ecx
-; SSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT:    andl $3, %ecx
 ; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; SSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -1120,17 +1116,13 @@ define void @indices_convert() {
 ;
 ; SSSE3-LABEL: indices_convert:
 ; SSSE3:       # %bb.0: # %bb
-; SSSE3-NEXT:    movdqa (%rax), %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSSE3-NEXT:    movd %xmm1, %eax
-; SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movaps (%rax), %xmm0
+; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movl (%rax), %eax
+; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSSE3-NEXT:    andl $3, %eax
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSSE3-NEXT:    movd %xmm1, %ecx
-; SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    andl $3, %ecx
 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 7bbcdee9a6802e..e26de4be7066fa 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -2911,23 +2911,12 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
 ;
 
 define <2 x double> @sitofp_load_2i64_to_2f64(ptr%a) {
-; SSE2-LABEL: sitofp_load_2i64_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    cvtsi2sdq (%rdi), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_2i64_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    cvtsi2sdq 8(%rdi), %xmm1
-; SSE41-NEXT:    cvtsi2sdq (%rdi), %xmm0
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    retq
+; SSE-LABEL: sitofp_load_2i64_to_2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsi2sdq 8(%rdi), %xmm1
+; SSE-NEXT:    cvtsi2sdq (%rdi), %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
 ;
 ; VEX-LABEL: sitofp_load_2i64_to_2f64:
 ; VEX:       # %bb.0:
@@ -3093,35 +3082,16 @@ define <2 x double> @sitofp_load_2i8_to_2f64(ptr%a) {
 }
 
 define <4 x double> @sitofp_load_4i64_to_4f64(ptr%a) {
-; SSE2-LABEL: sitofp_load_4i64_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
-; SSE2-NEXT:    cvtsi2sdq (%rdi), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2sdq 16(%rdi), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_4i64_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    cvtsi2sdq 8(%rdi), %xmm1
-; SSE41-NEXT:    cvtsi2sdq (%rdi), %xmm0
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    cvtsi2sdq 24(%rdi), %xmm2
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2sdq 16(%rdi), %xmm1
-; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE41-NEXT:    retq
+; SSE-LABEL: sitofp_load_4i64_to_4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsi2sdq 8(%rdi), %xmm1
+; SSE-NEXT:    cvtsi2sdq (%rdi), %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    cvtsi2sdq 24(%rdi), %xmm2
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    cvtsi2sdq 16(%rdi), %xmm1
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT:    retq
 ;
 ; VEX-LABEL: sitofp_load_4i64_to_4f64:
 ; VEX:       # %bb.0:
@@ -3865,22 +3835,14 @@ define <4 x double> @uitofp_load_4i8_to_4f64(ptr%a) {
 define <4 x float> @sitofp_load_4i64_to_4f32(ptr%a) {
 ; SSE2-LABEL: sitofp_load_4i64_to_4f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    cvtsi2ssq 16(%rdi), %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    cvtsi2ssq 24(%rdi), %xmm0
+; SSE2-NEXT:    cvtsi2ssq 16(%rdi), %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    cvtsi2ssq 8(%rdi), %xmm2
 ; SSE2-NEXT:    xorps %xmm0, %xmm0
 ; SSE2-NEXT:    cvtsi2ssq (%rdi), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: sitofp_load_4i64_to_4f32:
@@ -4015,39 +3977,24 @@ define <4 x float> @sitofp_load_4i8_to_4f32(ptr%a) {
 define <8 x float> @sitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-LABEL: sitofp_load_8i64_to_8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
-; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
-; SSE2-NEXT:    cvtsi2ssq 16(%rdi), %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT:    cvtsi2ssq 24(%rdi), %xmm0
+; SSE2-NEXT:    cvtsi2ssq 16(%rdi), %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    cvtsi2ssq 8(%rdi), %xmm2
 ; SSE2-NEXT:    xorps %xmm0, %xmm0
 ; SSE2-NEXT:    cvtsi2ssq (%rdi), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE2-NEXT:    xorps %xmm4, %xmm4
-; SSE2-NEXT:    cvtsi2ssq 48(%rdi), %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    cvtsi2ssq 56(%rdi), %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2ssq 48(%rdi), %xmm2
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    cvtsi2ssq 40(%rdi), %xmm3
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
 ; SSE2-NEXT:    cvtsi2ssq 32(%rdi), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: sitofp_load_8i64_to_8f32:
@@ -4256,70 +4203,64 @@ define <8 x float> @sitofp_load_8i8_to_8f32(ptr%a) {
 define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; SSE2-LABEL: uitofp_load_4i64_to_4f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq 24(%rdi), %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB83_1
 ; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    jmp .LBB83_3
 ; SSE2-NEXT:  .LBB83_1:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
 ; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
 ; SSE2-NEXT:  .LBB83_3:
-; SSE2-NEXT:    movq (%rdi), %rax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rcx
-; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB83_4
 ; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    jmp .LBB83_6
 ; SSE2-NEXT:  .LBB83_4:
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
+; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:  .LBB83_6:
+; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movq 8(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    js .LBB83_7
+; SSE2-NEXT:  # %bb.8:
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
+; SSE2-NEXT:    jmp .LBB83_9
+; SSE2-NEXT:  .LBB83_7:
 ; SSE2-NEXT:    movq %rcx, %rdx
 ; SSE2-NEXT:    shrq %rdx
 ; SSE2-NEXT:    andl $1, %ecx
 ; SSE2-NEXT:    orq %rdx, %rcx
 ; SSE2-NEXT:    cvtsi2ss %rcx, %xmm2
 ; SSE2-NEXT:    addss %xmm2, %xmm2
-; SSE2-NEXT:  .LBB83_6:
-; SSE2-NEXT:    movdqa (%rdi), %xmm3
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB83_7
-; SSE2-NEXT:  # %bb.8:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB83_9
-; SSE2-NEXT:  .LBB83_7:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
 ; SSE2-NEXT:  .LBB83_9:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB83_10
 ; SSE2-NEXT:  # %bb.11:
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    jmp .LBB83_12
 ; SSE2-NEXT:  .LBB83_10:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
 ; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    addss %xmm2, %xmm2
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm0
 ; SSE2-NEXT:  .LBB83_12:
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -4591,8 +4532,7 @@ define <4 x float> @uitofp_load_4i8_to_4f32(ptr%a) {
 define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-LABEL: uitofp_load_8i64_to_8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq 24(%rdi), %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_1
 ; SSE2-NEXT:  # %bb.2:
@@ -4606,127 +4546,114 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
 ; SSE2-NEXT:    addss %xmm2, %xmm2
 ; SSE2-NEXT:  .LBB87_3:
-; SSE2-NEXT:    movq (%rdi), %rax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rcx
-; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_4
 ; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm1
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    jmp .LBB87_6
 ; SSE2-NEXT:  .LBB87_4:
-; SSE2-NEXT:    movq %rcx, %rdx
-; SSE2-NEXT:    shrq %rdx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm1
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    addss %xmm1, %xmm1
 ; SSE2-NEXT:  .LBB87_6:
-; SSE2-NEXT:    movdqa (%rdi), %xmm3
-; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movq 8(%rdi), %rcx
+; SSE2-NEXT:    testq %rcx, %rcx
 ; SSE2-NEXT:    js .LBB87_7
 ; SSE2-NEXT:  # %bb.8:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB87_9
-; SSE2-NEXT:  .LBB87_7:
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm3
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    jns .LBB87_11
+; SSE2-NEXT:  .LBB87_10:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
 ; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
 ; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB87_9:
-; SSE2-NEXT:    movq 48(%rdi), %rax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE2-NEXT:    movq %xmm3, %rcx
-; SSE2-NEXT:    testq %rcx, %rcx
-; SSE2-NEXT:    js .LBB87_10
-; SSE2-NEXT:  # %bb.11:
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm4
 ; SSE2-NEXT:    jmp .LBB87_12
-; SSE2-NEXT:  .LBB87_10:
+; SSE2-NEXT:  .LBB87_7:
 ; SSE2-NEXT:    movq %rcx, %rdx
 ; SSE2-NEXT:    shrq %rdx
 ; SSE2-NEXT:    andl $1, %ecx
 ; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm4
-; SSE2-NEXT:    addss %xmm4, %xmm4
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm3
+; SSE2-NEXT:    addss %xmm3, %xmm3
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    js .LBB87_10
+; SSE2-NEXT:  .LBB87_11:
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
 ; SSE2-NEXT:  .LBB87_12:
-; SSE2-NEXT:    movdqa 48(%rdi), %xmm5
+; SSE2-NEXT:    movq 56(%rdi), %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_13
 ; SSE2-NEXT:  # %bb.14:
-; SSE2-NEXT:    xorps %xmm3, %xmm3
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm5
 ; SSE2-NEXT:    jmp .LBB87_15
 ; SSE2-NEXT:  .LBB87_13:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
 ; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm3, %xmm3
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE2-NEXT:    addss %xmm3, %xmm3
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm5
+; SSE2-NEXT:    addss %xmm5, %xmm5
 ; SSE2-NEXT:  .LBB87_15:
-; SSE2-NEXT:    movq 32(%rdi), %rax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; SSE2-NEXT:    movq %xmm5, %rcx
-; SSE2-NEXT:    testq %rcx, %rcx
+; SSE2-NEXT:    movq 48(%rdi), %rax
+; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_16
 ; SSE2-NEXT:  # %bb.17:
-; SSE2-NEXT:    xorps %xmm5, %xmm5
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm5
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
 ; SSE2-NEXT:    jmp .LBB87_18
 ; SSE2-NEXT:  .LBB87_16:
-; SSE2-NEXT:    movq %rcx, %rdx
-; SSE2-NEXT:    shrq %rdx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    xorps %xmm5, %xmm5
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm5
-; SSE2-NEXT:    addss %xmm5, %xmm5
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andl $1, %eax
+; SSE2-NEXT:    orq %rcx, %rax
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
+; SSE2-NEXT:    addss %xmm4, %xmm4
 ; SSE2-NEXT:  .LBB87_18:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT:    movdqa 32(%rdi), %xmm4
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    movq 40(%rdi), %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_19
 ; SSE2-NEXT:  # %bb.20:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
 ; SSE2-NEXT:    jmp .LBB87_21
 ; SSE2-NEXT:  .LBB87_19:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
 ; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm1
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE2-NEXT:    addss %xmm2, %xmm2
 ; SSE2-NEXT:  .LBB87_21:
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
-; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    movq 32(%rdi), %rax
 ; SSE2-NEXT:    testq %rax, %rax
 ; SSE2-NEXT:    js .LBB87_22
 ; SSE2-NEXT:  # %bb.23:
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
 ; SSE2-NEXT:    jmp .LBB87_24
 ; SSE2-NEXT:  .LBB87_22:
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shrq %rcx
 ; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    addss %xmm2, %xmm2
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
+; SSE2-NEXT:    addss %xmm1, %xmm1
 ; SSE2-NEXT:  .LBB87_24:
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_load_8i64_to_8f32:

>From 5d3ef06509c2f1fc5384fa64e5848d12f7b8811e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 27 Mar 2024 17:15:48 +0000
Subject: [PATCH 52/54] [X86] combine-pavg.ll - add demandedelts test coverage
 for #86284

---
 llvm/test/CodeGen/X86/combine-pavg.ll | 30 +++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
index 9bb7fec7eeacbe..7a8ddf5178d3d8 100644
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -80,3 +80,33 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
   %trunc = trunc <16 x i16> %shuffle to <16 x i8>
   ret <16 x i8> %trunc
 }
+
+define <8 x i16> @combine_pavgw_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: combine_pavgw_demandedelts:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,12,13,12,13]
+; SSE-NEXT:    pavgw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_pavgw_demandedelts:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,12,13,12,13]
+; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_pavgw_demandedelts:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s0 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  %avg = tail call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %s0, <8 x i16> %a1)
+  %shuffle = shufflevector <8 x i16> %avg, <8 x i16> poison, <8 x i32> zeroinitializer
+  ret <8 x i16> %shuffle
+}
+

>From 35d55f2894a2a2cdca5db494f519aa5ec7273678 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <justinfargnoli at gmail.com>
Date: Wed, 27 Mar 2024 10:30:17 -0700
Subject: [PATCH 53/54] [NFC][mlir] Reorder `declarePromisedInterface()`
 operands (#86628)

Reorder the template operands of `declarePromisedInterface()` to match
`declarePromisedInterfaces()`.
---
 mlir/include/mlir/IR/Dialect.h                     |  4 ++--
 mlir/lib/Dialect/Arith/IR/ArithDialect.cpp         |  6 +++---
 mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp     |  2 +-
 mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp |  6 +++---
 mlir/lib/Dialect/Func/IR/FuncOps.cpp               |  4 ++--
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp             |  4 ++--
 mlir/lib/Dialect/Index/IR/IndexDialect.cpp         |  2 +-
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp         |  4 ++--
 mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp        |  2 +-
 mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp       | 12 ++++++------
 mlir/lib/Dialect/Math/IR/MathDialect.cpp           |  2 +-
 mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp       |  4 ++--
 mlir/lib/Dialect/SCF/IR/SCF.cpp                    |  2 +-
 mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp         |  2 +-
 mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp       |  2 +-
 mlir/lib/Dialect/UB/IR/UBOps.cpp                   |  2 +-
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp           |  4 ++--
 mlir/unittests/IR/InterfaceAttachmentTest.cpp      |  4 ++--
 18 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h
index 6c8a170a03c72d..f7c1f4df16fc48 100644
--- a/mlir/include/mlir/IR/Dialect.h
+++ b/mlir/include/mlir/IR/Dialect.h
@@ -210,7 +210,7 @@ class Dialect {
   /// registration. The promised interface type can be an interface of any type
   /// not just a dialect interface, i.e. it may also be an
   /// AttributeInterface/OpInterface/TypeInterface/etc.
-  template <typename ConcreteT, typename InterfaceT>
+  template <typename InterfaceT, typename ConcreteT>
   void declarePromisedInterface() {
     unresolvedPromisedInterfaces.insert(
         {TypeID::get<ConcreteT>(), InterfaceT::getInterfaceID()});
@@ -221,7 +221,7 @@ class Dialect {
   // declarePromisedInterfaces<FunctionOpInterface, MyFuncType1, MyFuncType2>()
   template <typename InterfaceT, typename... ConcreteT>
   void declarePromisedInterfaces() {
-    (declarePromisedInterface<ConcreteT, InterfaceT>(), ...);
+    (declarePromisedInterface<InterfaceT, ConcreteT>(), ...);
   }
 
   /// Checks if the given interface, which is attempting to be used, is a
diff --git a/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp b/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
index 6a593185ccedce..042acf61009000 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
@@ -48,9 +48,9 @@ void arith::ArithDialect::initialize() {
 #include "mlir/Dialect/Arith/IR/ArithOpsAttributes.cpp.inc"
       >();
   addInterfaces<ArithInlinerInterface>();
-  declarePromisedInterface<ArithDialect, ConvertToLLVMPatternInterface>();
-  declarePromisedInterface<SelectOp,
-                           bufferization::BufferDeallocationOpInterface>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, ArithDialect>();
+  declarePromisedInterface<bufferization::BufferDeallocationOpInterface,
+                           SelectOp>();
   declarePromisedInterfaces<bufferization::BufferizableOpInterface, ConstantOp,
                             IndexCastOp, SelectOp>();
   declarePromisedInterfaces<ValueBoundsOpInterface, AddIOp, ConstantOp, SubIOp,
diff --git a/mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp b/mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp
index ca57171af156f9..0bdcf434e062fe 100644
--- a/mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp
+++ b/mlir/lib/Dialect/Complex/IR/ComplexDialect.cpp
@@ -40,7 +40,7 @@ void complex::ComplexDialect::initialize() {
 #define GET_ATTRDEF_LIST
 #include "mlir/Dialect/Complex/IR/ComplexAttributes.cpp.inc"
       >();
-  declarePromisedInterface<ComplexDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, ComplexDialect>();
   addInterfaces<ComplexInlinerInterface>();
 }
 
diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
index c6b02b9703e75f..5d11f8f6cc458b 100644
--- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
+++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
@@ -70,11 +70,11 @@ void ControlFlowDialect::initialize() {
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.cpp.inc"
       >();
   addInterfaces<ControlFlowInlinerInterface>();
-  declarePromisedInterface<ControlFlowDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, ControlFlowDialect>();
   declarePromisedInterfaces<bufferization::BufferizableOpInterface, BranchOp,
                             CondBranchOp>();
-  declarePromisedInterface<CondBranchOp,
-                           bufferization::BufferDeallocationOpInterface>();
+  declarePromisedInterface<bufferization::BufferDeallocationOpInterface,
+                           CondBranchOp>();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Func/IR/FuncOps.cpp b/mlir/lib/Dialect/Func/IR/FuncOps.cpp
index ed2ecfe9d0fb51..95589e8989e27a 100644
--- a/mlir/lib/Dialect/Func/IR/FuncOps.cpp
+++ b/mlir/lib/Dialect/Func/IR/FuncOps.cpp
@@ -42,8 +42,8 @@ void FuncDialect::initialize() {
 #define GET_OP_LIST
 #include "mlir/Dialect/Func/IR/FuncOps.cpp.inc"
       >();
-  declarePromisedInterface<FuncDialect, DialectInlinerInterface>();
-  declarePromisedInterface<FuncDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<DialectInlinerInterface, FuncDialect>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, FuncDialect>();
   declarePromisedInterfaces<bufferization::BufferizableOpInterface, CallOp,
                             FuncOp, ReturnOp>();
 }
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index a02eca8b11790c..f1b9ca5c500208 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -216,8 +216,8 @@ void GPUDialect::initialize() {
 #include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc"
       >();
   addInterfaces<GPUInlinerInterface>();
-  declarePromisedInterface<TerminatorOp,
-                           bufferization::BufferDeallocationOpInterface>();
+  declarePromisedInterface<bufferization::BufferDeallocationOpInterface,
+                           TerminatorOp>();
 }
 
 static std::string getSparseHandleKeyword(SparseHandleKind kind) {
diff --git a/mlir/lib/Dialect/Index/IR/IndexDialect.cpp b/mlir/lib/Dialect/Index/IR/IndexDialect.cpp
index d631afa63b9a2a..183d0e33b2523b 100644
--- a/mlir/lib/Dialect/Index/IR/IndexDialect.cpp
+++ b/mlir/lib/Dialect/Index/IR/IndexDialect.cpp
@@ -19,7 +19,7 @@ using namespace mlir::index;
 void IndexDialect::initialize() {
   registerAttributes();
   registerOperations();
-  declarePromisedInterface<IndexDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, IndexDialect>();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 9e8407451a0855..94197e473ce012 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1044,8 +1044,8 @@ void NVVMDialect::initialize() {
   // Support unknown operations because not all NVVM operations are
   // registered.
   allowUnknownOperations();
-  declarePromisedInterface<NVVMDialect, ConvertToLLVMPatternInterface>();
-  declarePromisedInterface<NVVMTargetAttr, gpu::TargetAttrInterface>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, NVVMDialect>();
+  declarePromisedInterface<gpu::TargetAttrInterface, NVVMTargetAttr>();
 }
 
 LogicalResult NVVMDialect::verifyOperationAttribute(Operation *op,
diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
index 0f2e75cd7e8bc7..65b770ae326106 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
@@ -247,7 +247,7 @@ void ROCDLDialect::initialize() {
 
   // Support unknown operations because not all ROCDL operations are registered.
   allowUnknownOperations();
-  declarePromisedInterface<ROCDLTargetAttr, gpu::TargetAttrInterface>();
+  declarePromisedInterface<gpu::TargetAttrInterface, ROCDLTargetAttr>();
 }
 
 LogicalResult ROCDLDialect::verifyOperationAttribute(Operation *op,
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
index a6936fde43709d..9e50c355c50417 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
@@ -123,16 +123,16 @@ void mlir::linalg::LinalgDialect::initialize() {
 
   addInterfaces<LinalgInlinerInterface>();
 
-  declarePromisedInterface<GenericOp, mesh::ShardingInterface>();
+  declarePromisedInterface<mesh::ShardingInterface, GenericOp>();
   declarePromisedInterfaces<mesh::ShardingInterface,
 #define GET_OP_LIST
 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
                             >();
-  declarePromisedInterface<CopyOp, SubsetOpInterface>();
-  declarePromisedInterface<CopyOp, SubsetInsertionOpInterface>();
-  declarePromisedInterface<IndexOp, ValueBoundsOpInterface>();
-  declarePromisedInterface<linalg::GenericOp, TilingInterface>();
-  declarePromisedInterface<linalg::GenericOp, PartialReductionOpInterface>();
+  declarePromisedInterface<SubsetOpInterface, CopyOp>();
+  declarePromisedInterface<SubsetInsertionOpInterface, CopyOp>();
+  declarePromisedInterface<ValueBoundsOpInterface, IndexOp>();
+  declarePromisedInterface<TilingInterface, linalg::GenericOp>();
+  declarePromisedInterface<PartialReductionOpInterface, linalg::GenericOp>();
   declarePromisedInterfaces<TilingInterface,
 #define GET_OP_LIST
 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
diff --git a/mlir/lib/Dialect/Math/IR/MathDialect.cpp b/mlir/lib/Dialect/Math/IR/MathDialect.cpp
index a71b24cb1b9737..285b5ca5940500 100644
--- a/mlir/lib/Dialect/Math/IR/MathDialect.cpp
+++ b/mlir/lib/Dialect/Math/IR/MathDialect.cpp
@@ -35,5 +35,5 @@ void mlir::math::MathDialect::initialize() {
 #include "mlir/Dialect/Math/IR/MathOps.cpp.inc"
       >();
   addInterfaces<MathInlinerInterface>();
-  declarePromisedInterface<MathDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, MathDialect>();
 }
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
index 41082a85a485f2..3a8bd12ba25863 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
@@ -47,14 +47,14 @@ void mlir::memref::MemRefDialect::initialize() {
 #include "mlir/Dialect/MemRef/IR/MemRefOps.cpp.inc"
       >();
   addInterfaces<MemRefInlinerInterface>();
-  declarePromisedInterface<MemRefDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, MemRefDialect>();
   declarePromisedInterfaces<bufferization::AllocationOpInterface, AllocOp,
                             AllocaOp, ReallocOp>();
   declarePromisedInterfaces<RuntimeVerifiableOpInterface, CastOp, ExpandShapeOp,
                             LoadOp, ReinterpretCastOp, StoreOp, SubViewOp>();
   declarePromisedInterfaces<ValueBoundsOpInterface, AllocOp, AllocaOp, CastOp,
                             DimOp, GetGlobalOp, RankOp, SubViewOp>();
-  declarePromisedInterface<MemRefType, DestructurableTypeInterface>();
+  declarePromisedInterface<DestructurableTypeInterface, MemRefType>();
 }
 
 /// Finds the unique dealloc operation (if one exists) for `allocValue`.
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index ddb9676eb4f628..5bca8e85f889d9 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -79,7 +79,7 @@ void SCFDialect::initialize() {
   declarePromisedInterfaces<bufferization::BufferizableOpInterface, ConditionOp,
                             ExecuteRegionOp, ForOp, IfOp, IndexSwitchOp,
                             ForallOp, InParallelOp, WhileOp, YieldOp>();
-  declarePromisedInterface<ForOp, ValueBoundsOpInterface>();
+  declarePromisedInterface<ValueBoundsOpInterface, ForOp>();
 }
 
 /// Default callback for IfOp builders. Inserts a yield without arguments.
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
index e914f46bdef643..72488d6e5d0b09 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
@@ -135,7 +135,7 @@ void SPIRVDialect::initialize() {
 
   // Allow unknown operations because SPIR-V is extensible.
   allowUnknownOperations();
-  declarePromisedInterface<TargetEnvAttr, gpu::TargetAttrInterface>();
+  declarePromisedInterface<gpu::TargetAttrInterface, TargetEnvAttr>();
 }
 
 std::string SPIRVDialect::getAttributeName(Decoration decoration) {
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
index 4b3156728cc979..002077753b1324 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
@@ -62,7 +62,7 @@ void TensorDialect::initialize() {
                             ParallelInsertSliceOp>();
   declarePromisedInterfaces<SubsetInsertionOpInterface, InsertSliceOp,
                             ParallelInsertSliceOp>();
-  declarePromisedInterface<ExtractSliceOp, SubsetExtractionOpInterface>();
+  declarePromisedInterface<SubsetExtractionOpInterface, ExtractSliceOp>();
   declarePromisedInterfaces<TilingInterface, PadOp, PackOp, UnPackOp>();
   declarePromisedInterfaces<ValueBoundsOpInterface, CastOp, DimOp, EmptyOp,
                             ExtractSliceOp, PadOp, RankOp>();
diff --git a/mlir/lib/Dialect/UB/IR/UBOps.cpp b/mlir/lib/Dialect/UB/IR/UBOps.cpp
index 3a2010cdcb5c7c..5b2cfe7bf42642 100644
--- a/mlir/lib/Dialect/UB/IR/UBOps.cpp
+++ b/mlir/lib/Dialect/UB/IR/UBOps.cpp
@@ -46,7 +46,7 @@ void UBDialect::initialize() {
 #include "mlir/Dialect/UB/IR/UBOpsAttributes.cpp.inc"
       >();
   addInterfaces<UBInlinerInterface>();
-  declarePromisedInterface<UBDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<ConvertToLLVMPatternInterface, UBDialect>();
 }
 
 Operation *UBDialect::materializeConstant(OpBuilder &builder, Attribute value,
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 35296824246eb6..e566bfacf37984 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -382,8 +382,8 @@ void VectorDialect::initialize() {
                             YieldOp>();
   declarePromisedInterfaces<SubsetOpInterface, TransferReadOp,
                             TransferWriteOp>();
-  declarePromisedInterface<TransferReadOp, SubsetExtractionOpInterface>();
-  declarePromisedInterface<TransferWriteOp, SubsetInsertionOpInterface>();
+  declarePromisedInterface<SubsetExtractionOpInterface, TransferReadOp>();
+  declarePromisedInterface<SubsetInsertionOpInterface, TransferWriteOp>();
 }
 
 /// Materialize a single constant operation from a given attribute value with
diff --git a/mlir/unittests/IR/InterfaceAttachmentTest.cpp b/mlir/unittests/IR/InterfaceAttachmentTest.cpp
index 16de34c45ec6e0..58049a9969e3ab 100644
--- a/mlir/unittests/IR/InterfaceAttachmentTest.cpp
+++ b/mlir/unittests/IR/InterfaceAttachmentTest.cpp
@@ -431,8 +431,8 @@ TEST(InterfaceAttachmentTest, PromisedInterfaces) {
       attr.hasPromiseOrImplementsInterface<TestExternalAttrInterface>());
 
   // Add a promise `TestExternalAttrInterface`.
-  testDialect->declarePromisedInterface<test::SimpleAAttr,
-                                        TestExternalAttrInterface>();
+  testDialect->declarePromisedInterface<TestExternalAttrInterface,
+                                        test::SimpleAAttr>();
   EXPECT_TRUE(
       attr.hasPromiseOrImplementsInterface<TestExternalAttrInterface>());
 

>From 96b3969a4d9e8faa3dd9b7e8b2696e2684cdebef Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Wed, 27 Mar 2024 10:30:58 -0700
Subject: [PATCH 54/54] [NFC][HWASAN] Precommit globals-access test

HWASAN does not behave as expected yet.

Reviewers: fmayer, thurstond

Reviewed By: fmayer, thurstond

Pull Request: https://github.com/llvm/llvm-project/pull/86771
---
 .../HWAddressSanitizer/globals-access.ll      | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/globals-access.ll

diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/globals-access.ll b/llvm/test/Instrumentation/HWAddressSanitizer/globals-access.ll
new file mode 100644
index 00000000000000..c83911f60149a7
--- /dev/null
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/globals-access.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --global-value-regex "x" --version 4
+; RUN: opt < %s -S -passes=hwasan -mtriple=aarch64 -hwasan-globals=0 | FileCheck %s --check-prefixes=NOGLOB
+; RUN: opt < %s -S -passes=hwasan -mtriple=aarch64 -hwasan-globals=1 | FileCheck %s
+
+@x = dso_local global i32 0, align 4
+
+;.
+; NOGLOB: @x = dso_local global i32 0, align 4
+;.
+; CHECK: @x = alias i32, inttoptr (i64 add (i64 ptrtoint (ptr @x.hwasan to i64), i64 5260204364768739328) to ptr)
+;.
+define dso_local noundef i32 @_Z3tmpv() sanitize_hwaddress {
+; NOGLOB-LABEL: define dso_local noundef i32 @_Z3tmpv(
+; NOGLOB-SAME: ) #[[ATTR0:[0-9]+]] {
+; NOGLOB-NEXT:  entry:
+; NOGLOB-NEXT:    [[TMP12:%.*]] = load i64, ptr @__hwasan_tls, align 8
+; NOGLOB-NEXT:    [[TMP1:%.*]] = or i64 [[TMP12]], 4294967295
+; NOGLOB-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP1]], 1
+; NOGLOB-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
+; NOGLOB-NEXT:    [[TMP3:%.*]] = lshr i64 ptrtoint (ptr @x to i64), 56
+; NOGLOB-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
+; NOGLOB-NEXT:    [[TMP5:%.*]] = and i64 ptrtoint (ptr @x to i64), 72057594037927935
+; NOGLOB-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP5]], 4
+; NOGLOB-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP2]], i64 [[TMP6]]
+; NOGLOB-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1
+; NOGLOB-NEXT:    [[TMP9:%.*]] = icmp ne i8 [[TMP4]], [[TMP8]]
+; NOGLOB-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1:![0-9]+]]
+; NOGLOB:       10:
+; NOGLOB-NEXT:    call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP2]], ptr @x, i32 2)
+; NOGLOB-NEXT:    br label [[TMP11]]
+; NOGLOB:       11:
+; NOGLOB-NEXT:    [[TMP0:%.*]] = load i32, ptr @x, align 4
+; NOGLOB-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-LABEL: define dso_local noundef i32 @_Z3tmpv(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr @__hwasan_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[TMP12]], 4294967295
+; CHECK-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 ptrtoint (ptr @x to i64), 56
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = and i64 ptrtoint (ptr @x to i64), 72057594037927935
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP2]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i8 [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK:       10:
+; CHECK-NEXT:    call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP2]], ptr @x, i32 2)
+; CHECK-NEXT:    br label [[TMP11]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @x, align 4
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  %0 = load i32, ptr @x, align 4
+  ret i32 %0
+}



More information about the llvm-branch-commits mailing list