[clang] [libcxx] [llvm] [libcxx] renames `__split_buffer` alias template to `_SplitBuffer` (PR #180284)
Christopher Di Bella via cfe-commits
cfe-commits at lists.llvm.org
Fri Feb 6 13:47:06 PST 2026
https://github.com/cjdb updated https://github.com/llvm/llvm-project/pull/180284
>From f79e8b9a0fa15c3ef72624ff70aa24f8e4cec2d5 Mon Sep 17 00:00:00 2001
From: Christopher Di Bella <cjdb at google.com>
Date: Fri, 6 Feb 2026 21:16:11 +0000
Subject: [PATCH 01/13] [libcxx] renames `__split_buffer` alias template to
`_SplitBuffer`
`-Wchanges-meaning` is a GCC warning that catches shadowing in more
contexts. While a bit annoying here, it's a helpful warning. As such,
we need to rename the `__split_buffer` alias template in `std::vector`
so that we don't trip it up.
---
libcxx/include/__vector/vector.h | 46 +++++++++++++++-----------------
1 file changed, 22 insertions(+), 24 deletions(-)
diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h
index 9747575bedafc..37e46bf30fc6a 100644
--- a/libcxx/include/__vector/vector.h
+++ b/libcxx/include/__vector/vector.h
@@ -86,7 +86,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Tp, class _Allocator /* = allocator<_Tp> */>
class vector {
template <class _Up, class _Alloc>
- using __split_buffer _LIBCPP_NODEBUG = std::__split_buffer<_Up, _Alloc, __split_buffer_pointer_layout>;
+ using _SplitBuffer _LIBCPP_NODEBUG = std::__split_buffer<_Up, _Alloc, __split_buffer_pointer_layout>;
public:
//
@@ -487,7 +487,7 @@ class vector {
if (__len < __cap_ - __end_) {
__construct_at_end(ranges::begin(__range), ranges::end(__range), __len);
} else {
- __split_buffer<value_type, allocator_type> __buffer(__recommend(size() + __len), size(), __alloc_);
+ _SplitBuffer __buffer(__recommend(size() + __len), size(), __alloc_);
__buffer.__construct_at_end_with_size(ranges::begin(__range), __len);
__swap_out_circular_buffer(__buffer);
}
@@ -698,10 +698,9 @@ class vector {
#endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR
}
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
- __swap_out_circular_buffer(__split_buffer<value_type, allocator_type>& __v);
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_out_circular_buffer(_SplitBuffer& __v);
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer
- __swap_out_circular_buffer(__split_buffer<value_type, allocator_type>& __v, pointer __p);
+ __swap_out_circular_buffer(_SplitBuffer& __v, pointer __p);
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
__move_range(pointer __from_s, pointer __from_e, pointer __to);
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, true_type)
@@ -808,21 +807,21 @@ class vector {
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector&, false_type) _NOEXCEPT {}
template <class _Ptr = pointer, __enable_if_t<is_pointer<_Ptr>::value, int> = 0>
- static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _Ptr
+ static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI pointer
__add_alignment_assumption(_Ptr __p) _NOEXCEPT {
if (!__libcpp_is_constant_evaluated()) {
- return static_cast<_Ptr>(__builtin_assume_aligned(__p, _LIBCPP_ALIGNOF(decltype(*__p))));
+ return static_cast<pointer>(__builtin_assume_aligned(__p, _LIBCPP_ALIGNOF(decltype(*__p))));
}
return __p;
}
template <class _Ptr = pointer, __enable_if_t<!is_pointer<_Ptr>::value, int> = 0>
- static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _Ptr
+ static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI pointer
__add_alignment_assumption(_Ptr __p) _NOEXCEPT {
return __p;
}
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_layouts(__split_buffer<_Tp, allocator_type>& __sb) {
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __swap_layouts(_SplitBuffer& __sb) {
auto __vector_begin = __begin_;
auto __vector_sentinel = __end_;
auto __vector_cap = __cap_;
@@ -866,8 +865,7 @@ vector(from_range_t, _Range&&, _Alloc = _Alloc()) -> vector<ranges::range_value_
// *this and __v. It is assumed that __v provides space for exactly (__end_ - __begin_) objects in the front. This
// function has a strong exception guarantee.
template <class _Tp, class _Allocator>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 void
-vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer<value_type, allocator_type>& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__swap_out_circular_buffer(_SplitBuffer& __v) {
__annotate_delete();
auto __new_begin = __v.begin() - size();
std::__uninitialized_allocator_relocate(
@@ -886,7 +884,7 @@ vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer<value_type, a
// function has a strong exception guarantee if __begin_ == __p || __end_ == __p.
template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer
-vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer<value_type, allocator_type>& __v, pointer __p) {
+vector<_Tp, _Allocator>::__swap_out_circular_buffer(_SplitBuffer& __v, pointer __p) {
__annotate_delete();
pointer __ret = __v.begin();
@@ -1086,7 +1084,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::reserve(size_type __
if (__n > capacity()) {
if (__n > max_size())
this->__throw_length_error();
- __split_buffer<value_type, allocator_type> __v(__n, size(), this->__alloc_);
+ _SplitBuffer __v(__n, size(), this->__alloc_);
__swap_out_circular_buffer(__v);
}
}
@@ -1097,7 +1095,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::shrink_to_fit() _NOE
#if _LIBCPP_HAS_EXCEPTIONS
try {
#endif // _LIBCPP_HAS_EXCEPTIONS
- __split_buffer<value_type, allocator_type> __v(size(), size(), this->__alloc_);
+ _SplitBuffer __v(size(), size(), this->__alloc_);
// The Standard mandates shrink_to_fit() does not increase the capacity.
// With equal capacity keep the existing buffer. This avoids extra work
// due to swapping the elements.
@@ -1114,7 +1112,7 @@ template <class _Tp, class _Allocator>
template <class... _Args>
_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer
vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) {
- __split_buffer<value_type, allocator_type> __v(__recommend(size() + 1), size(), this->__alloc_);
+ _SplitBuffer __v(__recommend(size() + 1), size(), this->__alloc_);
// __v.emplace_back(std::forward<_Args>(__args)...);
pointer __end = __v.end();
__alloc_traits::construct(this->__alloc_, std::__to_address(__end), std::forward<_Args>(__args)...);
@@ -1217,7 +1215,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, const_reference __x)
*__p = *__xr;
}
} else {
- __split_buffer<value_type, allocator_type> __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_);
+ _SplitBuffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_);
__v.emplace_back(__x);
__p = __swap_out_circular_buffer(__v, __p);
}
@@ -1236,7 +1234,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, value_type&& __x) {
*__p = std::move(__x);
}
} else {
- __split_buffer<value_type, allocator_type> __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_);
+ _SplitBuffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_);
__v.emplace_back(std::move(__x));
__p = __swap_out_circular_buffer(__v, __p);
}
@@ -1257,7 +1255,7 @@ vector<_Tp, _Allocator>::emplace(const_iterator __position, _Args&&... __args) {
*__p = std::move(__tmp.get());
}
} else {
- __split_buffer<value_type, allocator_type> __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_);
+ _SplitBuffer __v(__recommend(size() + 1), __p - this->__begin_, this->__alloc_);
__v.emplace_back(std::forward<_Args>(__args)...);
__p = __swap_out_circular_buffer(__v, __p);
}
@@ -1285,7 +1283,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, size_type __n, const_
std::fill_n(__p, __n, *__xr);
}
} else {
- __split_buffer<value_type, allocator_type> __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_);
+ _SplitBuffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_);
__v.__construct_at_end(__n, __x);
__p = __swap_out_circular_buffer(__v, __p);
}
@@ -1306,11 +1304,11 @@ vector<_Tp, _Allocator>::__insert_with_sentinel(const_iterator __position, _Inpu
if (__first == __last)
(void)std::rotate(__p, __old_last, this->__end_);
else {
- __split_buffer<value_type, allocator_type> __v(__alloc_);
+ _SplitBuffer __v(__alloc_);
auto __guard = std::__make_exception_guard(
_AllocatorDestroyRangeReverse<allocator_type, pointer>(__alloc_, __old_last, this->__end_));
__v.__construct_at_end_with_sentinel(std::move(__first), std::move(__last));
- __split_buffer<value_type, allocator_type> __merged(
+ _SplitBuffer __merged(
__recommend(size() + __v.size()), __off, __alloc_); // has `__off` positions available at the front
std::__uninitialized_allocator_relocate(
__alloc_, std::__to_address(__old_last), std::__to_address(this->__end_), std::__to_address(__merged.end()));
@@ -1356,7 +1354,7 @@ vector<_Tp, _Allocator>::__insert_with_size(
__insert_assign_n_unchecked<_AlgPolicy>(std::move(__first), __n, __p);
}
} else {
- __split_buffer<value_type, allocator_type> __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_);
+ _SplitBuffer __v(__recommend(size() + __n), __p - this->__begin_, this->__alloc_);
__v.__construct_at_end_with_size(std::move(__first), __n);
__p = __swap_out_circular_buffer(__v, __p);
}
@@ -1371,7 +1369,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __n
if (__new_size <= capacity()) {
__construct_at_end(__new_size - __current_size);
} else {
- __split_buffer<value_type, allocator_type> __v(__recommend(__new_size), __current_size, __alloc_);
+ _SplitBuffer __v(__recommend(__new_size), __current_size, __alloc_);
__v.__construct_at_end(__new_size - __current_size);
__swap_out_circular_buffer(__v);
}
@@ -1387,7 +1385,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __n
if (__new_size <= capacity())
__construct_at_end(__new_size - __current_size, __x);
else {
- __split_buffer<value_type, allocator_type> __v(__recommend(__new_size), __current_size, __alloc_);
+ _SplitBuffer __v(__recommend(__new_size), __current_size, __alloc_);
__v.__construct_at_end(__new_size - __current_size, __x);
__swap_out_circular_buffer(__v);
}
>From c34cb4a6e10b374c15c61dbce2133f3434b4960a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 6 Feb 2026 21:14:33 +0000
Subject: [PATCH 02/13] Reapply "[SCEVExp] Use SCEVPtrToAddr in
tryToReuseLCSSAPhi if possible. (#180257)"
This reverts commit cb905605b2e95f88296afe136b21a7d2476cb058.
Recommit the patch with a small change to check the destination
type matches the address type, to avoid a crash on mismatch.
Original message:
This patch updates tryToReuseLCSSAPhi to use SCEVPtrToAddr, unless using
SCEVPtrToInt allows re-use, because the IR already contains a re-usable
phi using PtrToInt.
This is a first step towards migrating to SCEVPtrToAddr and avoids
regressions in follow-up changes.
PR: https://github.com/llvm/llvm-project/pull/178727
---
.../Analysis/ScalarEvolutionPatternMatch.h | 6 ++
.../Utils/ScalarEvolutionExpander.cpp | 46 ++++++++-----
.../reuse-lcssa-phi-scev-expansion.ll | 69 ++++++++++++++++++-
.../reuse-lcssa-phi-scev-expansion.ll | 6 +-
4 files changed, 106 insertions(+), 21 deletions(-)
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index f285eacc4c565..7b00d0109a68c 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -186,6 +186,12 @@ m_scev_PtrToInt(const Op0_t &Op0) {
return SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>(Op0);
}
+template <typename Op0_t>
+inline SCEVUnaryExpr_match<SCEVPtrToAddrExpr, Op0_t>
+m_scev_PtrToAddr(const Op0_t &Op0) {
+ return SCEVUnaryExpr_match<SCEVPtrToAddrExpr, Op0_t>(Op0);
+}
+
template <typename Op0_t>
inline SCEVUnaryExpr_match<SCEVTruncateExpr, Op0_t>
m_scev_Trunc(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index cccb944618e07..84562ecfcffe8 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1254,6 +1254,22 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
!SE.DT.dominates(EB, Builder.GetInsertBlock()))
return nullptr;
+ // Helper to check if the diff between S and ExitSCEV is simple enough to
+ // allow reusing the LCSSA phi.
+ auto CanReuse = [&](const SCEV *ExitSCEV) -> const SCEV * {
+ if (isa<SCEVCouldNotCompute>(ExitSCEV))
+ return nullptr;
+ const SCEV *Diff = SE.getMinusSCEV(S, ExitSCEV);
+ const SCEV *Op = Diff;
+ match(Op, m_scev_Add(m_SCEVConstant(), m_SCEV(Op)));
+ match(Op, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op)));
+ match(Op, m_scev_PtrToAddr(m_SCEV(Op))) ||
+ match(Op, m_scev_PtrToInt(m_SCEV(Op)));
+ if (!isa<SCEVConstant, SCEVUnknown>(Op))
+ return nullptr;
+ return Diff;
+ };
+
for (auto &PN : EB->phis()) {
if (!SE.isSCEVable(PN.getType()))
continue;
@@ -1261,22 +1277,20 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
if (!isa<SCEVAddRecExpr>(ExitSCEV))
continue;
Type *PhiTy = PN.getType();
- if (STy->isIntegerTy() && PhiTy->isPointerTy()) {
- ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy);
- if (isa<SCEVCouldNotCompute>(ExitSCEV))
- continue;
- } else if (S->getType() != PN.getType()) {
- continue;
+ const SCEV *Diff = nullptr;
+ if (STy->isIntegerTy() && PhiTy->isPointerTy() &&
+ DL.getAddressType(PhiTy) == STy) {
+ // Prefer ptrtoaddr over ptrtoint.
+ const SCEV *AddrSCEV = SE.getPtrToAddrExpr(ExitSCEV);
+ Diff = CanReuse(AddrSCEV);
+ if (!Diff) {
+ const SCEV *IntSCEV = SE.getPtrToIntExpr(ExitSCEV, STy);
+ Diff = CanReuse(IntSCEV);
+ }
+ } else if (STy == PhiTy) {
+ Diff = CanReuse(ExitSCEV);
}
-
- // Check if we can re-use the existing PN, by adjusting it with an expanded
- // offset, if the offset is simpler.
- const SCEV *Diff = SE.getMinusSCEV(S, ExitSCEV);
- const SCEV *Op = Diff;
- match(Op, m_scev_Add(m_SCEVConstant(), m_SCEV(Op)));
- match(Op, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op)));
- match(Op, m_scev_PtrToInt(m_SCEV(Op)));
- if (!isa<SCEVConstant, SCEVUnknown>(Op))
+ if (!Diff)
continue;
assert(Diff->getType()->isIntegerTy() &&
@@ -1286,7 +1300,7 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
if (PhiTy->isPointerTy()) {
if (STy->isPointerTy())
return Builder.CreatePtrAdd(BaseV, DiffV);
- BaseV = Builder.CreatePtrToInt(BaseV, DiffV->getType());
+ BaseV = Builder.CreatePtrToAddr(BaseV);
}
return Builder.CreateAdd(BaseV, DiffV);
}
diff --git a/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll
index a15db620e0082..d0e70c21c7bc6 100644
--- a/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll
@@ -10,7 +10,7 @@ define void @scev_expand_ptrtoint(i8 %x, ptr %start) {
; CHECK-LABEL: define void @scev_expand_ptrtoint(
; CHECK-SAME: i8 [[X:%.*]], ptr [[START:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[START1:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT: [[START1:%.*]] = ptrtoaddr ptr [[START]] to i64
; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]]
; CHECK: [[LOOP_1_HEADER]]:
; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1_LATCH:.*]] ]
@@ -36,7 +36,7 @@ define void @scev_expand_ptrtoint(i8 %x, ptr %start) {
; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], %[[LOOP_2_HEADER]] ], [ [[INDVAR]], %[[LOOP_2_HEADER]] ]
; CHECK-NEXT: [[PTR_IV_2_LCSSA:%.*]] = phi ptr [ [[PTR_IV_2]], %[[LOOP_2_HEADER]] ], [ [[PTR_IV_2]], %[[LOOP_2_HEADER]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1, [[START1]]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR_IV_1_LCSSA]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoaddr ptr [[PTR_IV_1_LCSSA]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[CMP_EXT]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDVAR_LCSSA]], [[TMP4]]
@@ -222,3 +222,68 @@ loop.2.latch:
exit:
ret void
}
+
+define void @expand_truncated_ptrtoint(ptr %A, ptr %B) {
+; CHECK-LABEL: define void @expand_truncated_ptrtoint(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: br label %[[LOOP_1:.*]]
+; CHECK: [[LOOP_1]]:
+; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], %[[LOOP_1]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[P_0:%.*]] = phi ptr [ [[A]], %[[ENTRY]] ], [ [[P_0_NEXT:%.*]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[P_0_NEXT]] = getelementptr i8, ptr [[P_0]], i64 -1
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
+; CHECK-NEXT: br i1 false, label %[[MIDDLE:.*]], label %[[LOOP_1]]
+; CHECK: [[MIDDLE]]:
+; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i32 [ [[INDVAR]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[P_0_LCSSA:%.*]] = phi ptr [ [[P_0]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[P_0_TO_INT:%.*]] = ptrtoint ptr [[P_0_LCSSA]] to i64
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[P_0_TO_INT]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[TRUNC]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 [[TMP0]], -1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[A1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[INDVAR_LCSSA]], -1
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[SCEVGEP]], i8 0, i64 [[TMP4]], i1 false)
+; CHECK-NEXT: br label %[[LOOP_2:.*]]
+; CHECK: [[LOOP_2]]:
+; CHECK-NEXT: [[P_1:%.*]] = phi ptr [ [[B]], %[[MIDDLE]] ], [ [[P_1_NEXT:%.*]], %[[LOOP_2]] ]
+; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ [[TRUNC]], %[[MIDDLE]] ], [ [[P_2_NEXT:%.*]], %[[LOOP_2]] ]
+; CHECK-NEXT: [[P_1_NEXT]] = getelementptr i8, ptr [[P_1]], i64 -1
+; CHECK-NEXT: [[P_2_NEXT]] = add i32 [[P_2]], -1
+; CHECK-NEXT: [[EC:%.*]] = icmp sgt i32 [[P_2]], 0
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_2]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.1
+
+loop.1:
+ %p.0 = phi ptr [ %A, %entry ], [ %p.0.next, %loop.1 ]
+ %p.0.next = getelementptr i8, ptr %p.0, i64 -1
+ call void @foo()
+ br i1 false, label %middle, label %loop.1
+
+middle:
+ %p.0.to.int = ptrtoint ptr %p.0 to i64
+ %trunc = trunc i64 %p.0.to.int to i32
+ br label %loop.2
+
+loop.2:
+ %p.1 = phi ptr [ %B, %middle ], [ %p.1.next, %loop.2 ]
+ %p.2 = phi i32 [ %trunc, %middle ], [ %p.2.next, %loop.2 ]
+ %p.1.next = getelementptr i8, ptr %p.1, i64 -1
+ store i8 0, ptr %p.1, align 1
+ %p.2.next = add i32 %p.2, -1
+ %ec = icmp sgt i32 %p.2, 0
+ br i1 %ec, label %loop.2, label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index 557c7e570766c..0b1c3165fe13a 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -118,7 +118,7 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) {
; CHECK-NEXT: [[SEL_DST_LCSSA:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ]
; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[PTR_IV_1_LCSSA]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoaddr ptr [[PTR_IV_1_LCSSA]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SEL_DST_LCSSA12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -308,7 +308,7 @@ define void @expand_diff_neg_ptrtoint_expr(ptr %src, ptr %start) {
; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[SRC2]]
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoaddr ptr [[TMP1]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP5]], [[TMP0]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
@@ -403,7 +403,7 @@ define void @scev_exp_reuse_const_add(ptr %dst, ptr %src) {
; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -2, [[SRC2]]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR_IV_1_NEXT_LCSSA]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoaddr ptr [[PTR_IV_1_NEXT_LCSSA]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
>From d0d7f6c21ab58c47db3bcbaf56811748fdc315f6 Mon Sep 17 00:00:00 2001
From: Fateme Hosseini <quic_fhossein at quicinc.com>
Date: Fri, 6 Feb 2026 15:19:34 -0600
Subject: [PATCH 03/13] [Hexagon] Add post-RA live variables analysis (#179531)
This patch adds HexagonLiveVariables, a post-RA liveness analysis for
physical registers, to keep block live-ins/live-outs and operand
kill/dead markers consistent after late Hexagon transforms; it is run
after GenMux in the pre-emit pipeline.
Author: Sergei Larin <slarin at qti.qualcomm.com>
Patch By: Fateme Hosseini <fhossein at qti.qualcomm.com>
Co-authored-by: Sergei Larin <slarin at qti.qualcomm.com>
---
llvm/lib/Target/Hexagon/CMakeLists.txt | 1 +
llvm/lib/Target/Hexagon/Hexagon.h | 2 +
.../Target/Hexagon/HexagonLiveVariables.cpp | 914 ++++++++++++++++++
.../lib/Target/Hexagon/HexagonLiveVariables.h | 134 +++
.../Target/Hexagon/HexagonTargetMachine.cpp | 10 +
llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll | 6 +-
.../CodeGen/Hexagon/live-vars/live-outs.ll | 77 ++
llvm/test/CodeGen/Hexagon/nbench1.ll | 4 +-
.../test/CodeGen/Hexagon/newvaluejump-kill.ll | 3 +-
.../test/CodeGen/Hexagon/swp-matmul-bitext.ll | 4 +-
10 files changed, 1147 insertions(+), 8 deletions(-)
create mode 100644 llvm/lib/Target/Hexagon/HexagonLiveVariables.cpp
create mode 100644 llvm/lib/Target/Hexagon/HexagonLiveVariables.h
create mode 100644 llvm/test/CodeGen/Hexagon/live-vars/live-outs.ll
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index e379625a00c05..bd87b9c894292 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_target(HexagonCodeGen
HexagonGenMux.cpp
HexagonGenPredicate.cpp
HexagonGlobalRegion.cpp
+ HexagonLiveVariables.cpp
HexagonHardwareLoops.cpp
HexagonHazardRecognizer.cpp
HexagonInstrInfo.cpp
diff --git a/llvm/lib/Target/Hexagon/Hexagon.h b/llvm/lib/Target/Hexagon/Hexagon.h
index 422ab20891b94..b3be89abd527b 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.h
+++ b/llvm/lib/Target/Hexagon/Hexagon.h
@@ -26,6 +26,7 @@ class Pass;
extern char &HexagonCopyHoistingID;
extern char &HexagonExpandCondsetsID;
extern char &HexagonTfrCleanupID;
+extern char &HexagonLiveVariablesID;
void initializeHexagonAsmPrinterPass(PassRegistry &);
void initializeHexagonBitSimplifyPass(PassRegistry &);
void initializeHexagonBranchRelaxationPass(PassRegistry &);
@@ -41,6 +42,7 @@ void initializeHexagonExpandCondsetsPass(PassRegistry &);
void initializeHexagonGenMemAbsolutePass(PassRegistry &);
void initializeHexagonGenMuxPass(PassRegistry &);
void initializeHexagonHardwareLoopsPass(PassRegistry &);
+void initializeHexagonLiveVariablesPass(PassRegistry &);
void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
void initializeHexagonLoopAlignPass(PassRegistry &);
void initializeHexagonLoopReschedulingPass(PassRegistry &);
diff --git a/llvm/lib/Target/Hexagon/HexagonLiveVariables.cpp b/llvm/lib/Target/Hexagon/HexagonLiveVariables.cpp
new file mode 100644
index 0000000000000..62ce823a58b99
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonLiveVariables.cpp
@@ -0,0 +1,914 @@
+
+//===----------------- HexagonLiveVariables.cpp ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Hexagon Live Variable Analysis
+// This file implements the Hexagon specific LiveVariables analysis pass.
+// This pass recomputes physical register liveness and updates live-ins for
+// non-entry blocks based on use/def information.
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon_live_vars"
+
+#include "HexagonLiveVariables.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+char HexagonLiveVariables::ID = 0;
+char &llvm::HexagonLiveVariablesID = HexagonLiveVariables::ID;
+
+INITIALIZE_PASS(HexagonLiveVariables, "hexagon-live-vars",
+ "Hexagon Live Variable Analysis", false, false)
+
+// TODO: Establish a protocol to handle liveness of predicated instructions.
+// Liveness for predicated instruction is a little convoluted.
+// TODO: In PhysRegDef and PhysRegUse, use a bit vector instead of 126 elems.
+class HexagonLiveVariablesImpl {
+ // Intermediate data structures
+ friend class llvm::HexagonLiveVariables;
+ typedef MachineBasicBlock::const_instr_iterator MICInstIterType;
+
+ MachineFunction *MF;
+
+ MachineRegisterInfo *MRI;
+
+ const TargetRegisterInfo *TRI;
+
+ const HexagonInstrInfo *QII;
+
+ unsigned NumRegs;
+
+ /// PhysRegInfo - Keep track of which instruction was the last def of a
+ /// physical register (possibly after a use). This is purely local to a BB.
+ SmallVector<MachineInstr *, 0> PhysRegDef;
+
+ /// PhysRegInfo - Keep track of which instruction was the last use of a
+ /// physical register (before any def). This is purely local property to a BB.
+ SmallVector<MachineInstr *, 0> PhysRegUse;
+
+ /// MBB -> (Uses, Defs)
+ /// Uses - use before any def in that MBB.
+ /// Defs - def before any uses in that MBB.
+ MBBUseDef_t MBBUseDefs;
+
+ /// MI -> (Uses, Defs)
+ MIUseDef_t MIUseDefs;
+
+ /// Live-out data for each MBB => U LiveIns (For all Successors of a MBB).
+ DenseMap<const MachineBasicBlock *, BitVector> MBBLiveOuts;
+
+ /// Each MachineBasicBlock is assigned a Distance which is
+ /// an approximation of MBB->size()*INSTR_SIZE+Some offsets.
+ /// This is helpful in quickly finding distance between
+ /// a branch and its target.
+ /// @note A pass which moves instructions should update this.
+ /// @note The data in distance map should be used carefully because
+ /// difference in the distances of two MI might not give relative distances
+ /// between them. The DistanceMap is mainly useful during pullup.
+ DenseMap<const MachineBasicBlock *, unsigned> DistanceMap;
+
+ // Blocks in depth first order
+ SmallVector<MachineBasicBlock *, 16> BlocksDepthFirst;
+
+ /// @brief Constructs use-defs of \p MBB by analyzing each MachineOperand.
+ /// Collects relevant information so that global liveness can be updated.
+ void constructUseDef(MachineBasicBlock *MBB);
+
+ /// Collects used-before-define set of registers.
+ /// A register is considered to be completely defined if
+ /// 1. The register
+ /// 2. Any of its super-reg
+ /// 3. All of its subregs
+ /// are defined. In these cases the register is not considered as
+ /// used-before-defined. In case of partial definition of a register
+ /// before its use, only the remaining subregs are included in the use-set.
+ /// @note: Assumes that a register can be completely defined, by defining
+ /// all of its sub-regs (if any).
+ void handlePhysRegUse(MachineOperand *MO, MachineInstr *MI, BitVector &Uses);
+
+ /// Collects defined-before-use set of registers. If there is any
+ /// use of register or its aliases then the register is not counted
+ /// as defined-before-use
+ /// @note: Assumes that a register can be completely defined, by defining
+ /// all of its sub-regs (if any).
+ void handlePhysRegDef(MachineOperand *MO, MachineInstr *MI, BitVector &Defs);
+
+ /// updateGlobalLiveness - wrapper around another overload
+ inline bool updateGlobalLiveness(MachineFunction &Fn);
+ bool updateGlobalLiveness(MachineBasicBlock *X, MachineBasicBlock *Y);
+
+ /// updateGlobalLiveness - updates liveness based on
+ /// livein and liveout entries.
+ bool updateGlobalLiveness(MachineBasicBlock *MBB, BitVector &Defs,
+ BitVector &LiveIns);
+
+ /// update live-ins when live-out has been calculated
+ bool updateLiveIns(MachineBasicBlock *MBB, BitVector &LiveIns,
+ const BitVector &LiveOuts);
+
+ bool updateLiveOuts(MachineBasicBlock *MBB, BitVector &LiveOuts);
+
+ /// updateLocalLiveness - update only kill flags of operands.
+ inline bool updateLocalLiveness(MachineFunction &Fn);
+
+ /// updateLocalLiveness - update only kill flags of operands.
+ bool updateLocalLiveness(MachineBasicBlock *MBB, bool UpdateBundle);
+
+ /// incrementalUpdate - update the liveness when \p MIDelta is moved from
+ /// \p From to \p To.
+ /// @note: This is extremely fragile now. It 'assumes' that the other
+ /// successor(s) of \p To do not use Defs of MIDelta.
+ /// It deletes the live-in of the \p From MBB.
+ bool incrementalUpdate(MICInstIterType MIDelta, MachineBasicBlock *From,
+ MachineBasicBlock *To);
+
+ /// addNewMBB - inform the LiveVariable Analysis that new MBB has been added.
+ /// update the liveness of this new MBB.
+ /// @note MBB should be empty. If we want to add an MI, add it after calling
+ /// this function.
+ void addNewMBB(MachineBasicBlock *MBB);
+
+ void addNewMI(MachineInstr *MI, MachineBasicBlock *MBB);
+ unsigned getNumRegs() const { return NumRegs; }
+
+ // Useful for clearing out after passes which move instructions around.
+ // e.g. GlobalScheduler.
+ void clearDistanceMap() { DistanceMap.clear(); }
+
+ /// Computes \p DistanceMap.
+ void generateDistanceMap(const MachineFunction &Fn);
+
+public:
+ bool runOnMachineFunction(MachineFunction &Fn, MachineDominatorTree &MDT,
+ MachinePostDominatorTree &MPDT);
+};
+
+//===----------------------------------------------------------------------===//
+// HexagonLiveVariables Functions
+//===----------------------------------------------------------------------===//
+HexagonLiveVariables::HexagonLiveVariables()
+ : MachineFunctionPass(ID), HLVComplete(false),
+ HLV(std::make_unique<HexagonLiveVariablesImpl>()) {
+ initializeHexagonLiveVariablesPass(*PassRegistry::getPassRegistry());
+}
+
+void HexagonLiveVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
+ AU.addRequired<MachinePostDominatorTreeWrapperPass>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
+ AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
+ AU.addPreserved("packets");
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void HexagonLiveVariables::recalculate(MachineFunction &MF) {
+ if (HLVComplete)
+ return;
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ auto &MPDT =
+ getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+ HLV->runOnMachineFunction(MF, MDT, MPDT);
+}
+
+bool HexagonLiveVariables::updateLocalLiveness(MachineFunction &Fn) {
+ return HLV->updateLocalLiveness(Fn);
+}
+
+bool HexagonLiveVariables::updateLocalLiveness(MachineBasicBlock *MBB,
+ bool updateBundle) {
+ HLV->constructUseDef(MBB); // XXX: This destroys MBBLiveOuts!
+ return HLV->updateLocalLiveness(MBB, updateBundle);
+}
+
+bool HexagonLiveVariables::incrementalUpdate(MICInstIterType MIDelta,
+ MachineBasicBlock *From,
+ MachineBasicBlock *To) {
+ assert(MIDelta->getParent() == To);
+ assert(From != To);
+ return HLV->incrementalUpdate(MIDelta, From, To);
+}
+
+void HexagonLiveVariables::addNewMBB(MachineBasicBlock *MBB) {
+ assert(MBB->empty());
+ HLV->addNewMBB(MBB);
+}
+
+void HexagonLiveVariables::addNewMI(MachineInstr *MI, MachineBasicBlock *MBB) {
+ HLV->addNewMI(MI, MBB);
+}
+
+void HexagonLiveVariables::constructUseDef(MachineBasicBlock *MBB) {
+ HLV->constructUseDef(MBB);
+}
+
+bool HexagonLiveVariables::runOnMachineFunction(MachineFunction &Fn) {
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ auto &MPDT =
+ getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
+ HLVComplete = !HLV->runOnMachineFunction(Fn, MDT, MPDT);
+ return HLVComplete;
+}
+
+bool HexagonLiveVariables::isLiveOut(const MachineBasicBlock *MBB,
+ unsigned Reg) const {
+ assert(HLVComplete && "Liveness Analysis not available");
+ auto It = HLV->MBBLiveOuts.find(MBB);
+ if (It == HLV->MBBLiveOuts.end())
+ llvm_unreachable("MBB not found in liveness map");
+ if (Reg >= It->second.size())
+ llvm_unreachable("Register index out of bounds");
+ return It->second[Reg];
+}
+
+const BitVector &
+HexagonLiveVariables::getLiveOuts(const MachineBasicBlock *MBB) const {
+ assert(HLVComplete && "Liveness Analysis not available");
+ auto It = HLV->MBBLiveOuts.find(MBB);
+ if (It == HLV->MBBLiveOuts.end())
+ llvm_unreachable("MBB not found in liveness map");
+ return It->second;
+}
+
+// Returns true when \p Reg is used within [MIBegin, MIEnd)
+// @note: MIBegin and MIEnd should be from same MBB
+// @note: It returns just the first use found in the range.
+// The Use is closest to MIEnd.
+// Takes care of aliases and predicated defs as well.
+bool HexagonLiveVariables::isUsedWithin(
+ MICInstIterType MIBegin, MICInstIterType MIEnd, unsigned Reg,
+ MICInstIterType &Use,
+ SmallPtrSet<MachineInstr *, 2> *ExceptionsList) const {
+ assert(HLVComplete && "Liveness Analysis not available");
+ Use = MIEnd;
+ if (MIBegin == MIEnd) // NULL Range.
+ return false;
+ MICInstIterType MII = MIEnd;
+ do {
+ --MII;
+ if (MII->isBundle() || MII->isDebugInstr())
+ continue;
+ if (ExceptionsList && ExceptionsList->contains(&*MII))
+ continue;
+ auto It = HLV->MIUseDefs.find(&*MII);
+ assert(It != HLV->MIUseDefs.end());
+ for (MCRegAliasIterator AI(Reg, HLV->TRI, true); AI.isValid(); ++AI)
+ if (It->second.first[*AI]) {
+ Use = MII;
+ return true;
+ }
+ } while (MII != MIBegin);
+ return false;
+}
+
+// Returns true when \p Reg is defined within [MIBegin, MIEnd)
+// @note: MIBegin and MIEnd should be from same MBB
+// The Def is closest to MIEnd.
+// Takes care of aliases and predicated defs as well.
+bool HexagonLiveVariables::isDefinedWithin(MICInstIterType MIBegin,
+ MICInstIterType MIEnd, unsigned Reg,
+ MICInstIterType &Def) const {
+ assert(HLVComplete && "Liveness Analysis not available");
+ Def = MIEnd;
+ if (MIBegin == MIEnd) // NULL Range.
+ return false;
+ MICInstIterType MII = MIEnd;
+ do {
+ --MII;
+ if (MII->isBundle() || MII->isDebugInstr())
+ continue;
+ auto It = HLV->MIUseDefs.find(&*MII);
+ assert(It != HLV->MIUseDefs.end());
+ for (MCRegAliasIterator AI(Reg, HLV->TRI, true); AI.isValid(); ++AI)
+ if (It->second.second[*AI]) {
+ Def = MII;
+ return true;
+ }
+ } while (MII != MIBegin);
+ return false;
+}
+
+// Returns true if any of the defs of \p MI is a live-in of \p MBB.
+bool HexagonLiveVariables::isDefLiveIn(const MachineInstr *MI,
+ const MachineBasicBlock *MBB) const {
+ assert(HLVComplete && "Liveness Analysis not available");
+ assert(MI && "Invalid machine instruction");
+ assert(MBB && "Invalid machine basic block");
+ auto It = HLV->MIUseDefs.find(MI);
+ assert(It != HLV->MIUseDefs.end() && "Missing MI use/def information");
+ BitVector MBBLiveIns(HLV->NumRegs);
+ for (MachineBasicBlock::livein_iterator lit = MBB->livein_begin();
+ lit != MBB->livein_end(); ++lit) {
+ // Include all the aliases of reg *lit.
+ for (MCRegAliasIterator AI((*lit).PhysReg, HLV->TRI, true); AI.isValid();
+ ++AI)
+ MBBLiveIns.set(*AI);
+ }
+ // Intersect.
+ return MBBLiveIns.anyCommon(It->second.second);
+}
+
+MBBUseDef_t &HexagonLiveVariables::getMBBUseDefs() { return HLV->MBBUseDefs; }
+
+MIUseDef_t &HexagonLiveVariables::getMIUseDefs() { return HLV->MIUseDefs; }
+
+unsigned HexagonLiveVariables::getDistanceBetween(const MachineBasicBlock *From,
+ const MachineBasicBlock *To,
+ unsigned BufferPerMBB) const {
+ assert(HLV->DistanceMap.find(From) != HLV->DistanceMap.end());
+ assert(HLV->DistanceMap.find(To) != HLV->DistanceMap.end());
+ unsigned FromSize = HLV->DistanceMap[From];
+ if (From == To)
+ return FromSize;
+ const MachineFunction *MF = From->getParent();
+ MachineFunction::const_iterator MBBI = MF->begin();
+ unsigned S = BufferPerMBB;
+ bool ToFirst = false;
+ while (MBBI != MF->end()) {
+ const MachineBasicBlock *MBB = &*MBBI;
+ if (MBB == From)
+ break;
+ else if (MBB == To) {
+ ToFirst = true;
+ break;
+ }
+ ++MBBI;
+ }
+ const MachineBasicBlock *ToFind = To;
+ if (ToFirst)
+ ToFind = From;
+ while (MBBI != MF->end()) {
+ const MachineBasicBlock *MBB = &*MBBI;
+ if (MBB == ToFind)
+ break;
+ S += HLV->DistanceMap[MBB] + BufferPerMBB;
+ ++MBBI;
+ }
+ if (ToFirst) // Jump in the opposite direction.
+ S += FromSize + HLV->DistanceMap[To] + 2 * BufferPerMBB;
+ return S;
+}
+
+void HexagonLiveVariables::regenerateDistanceMap(const MachineFunction &Fn) {
+ HLV->clearDistanceMap();
+ HLV->generateDistanceMap(Fn);
+}
+
+//===----------------------------------------------------------------------===//
+// HexagonLiveVariablesImpl Functions
+//===----------------------------------------------------------------------===//
+bool HexagonLiveVariablesImpl::runOnMachineFunction(
+ MachineFunction &Fn, MachineDominatorTree &MDT,
+ MachinePostDominatorTree &MPDT) {
+ LLVM_DEBUG(dbgs() << "\nHexagon Live Variables";);
+ Fn.RenumberBlocks();
+ // Update the block numbers in the dominator tree since we preserve it.
+ MDT.updateBlockNumbers();
+ MPDT.updateBlockNumbers();
+
+ MF = &Fn;
+ MRI = &Fn.getRegInfo();
+ auto &ST = Fn.getSubtarget<HexagonSubtarget>();
+ TRI = ST.getRegisterInfo();
+ QII = ST.getInstrInfo();
+
+ NumRegs = TRI->getNumRegs();
+
+ MBBUseDefs.clear();
+ MIUseDefs.clear();
+ MBBLiveOuts.clear();
+
+ LLVM_DEBUG(dbgs() << "\nNumber of registers in Hexagon is:" << NumRegs);
+
+ PhysRegDef.resize(NumRegs);
+ PhysRegUse.resize(NumRegs);
+
+ for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end(); MBBI != E;
+ ++MBBI) {
+ constructUseDef(&*MBBI);
+ }
+ updateGlobalLiveness(Fn);
+ return false;
+}
+
+void HexagonLiveVariablesImpl::constructUseDef(MachineBasicBlock *MBB) {
+ std::fill(PhysRegDef.begin(), PhysRegDef.end(), (MachineInstr *)0);
+ std::fill(PhysRegUse.begin(), PhysRegUse.end(), (MachineInstr *)0);
+
+ // Loop over all of the instructions, processing them.
+ std::pair<BitVector, BitVector> &UseDef = MBBUseDefs[MBB];
+ // Use before any def in a BB.
+ BitVector &Uses = UseDef.first;
+ // Defs before any use in a BB.
+ BitVector &Defs = UseDef.second;
+ // Initializing the LiveOut bit vector.
+ BitVector &LiveOuts = MBBLiveOuts[MBB];
+ Uses.resize(NumRegs, false);
+ Defs.resize(NumRegs, false);
+ LiveOuts.resize(NumRegs, false);
+ // BitVector might contain set bits from previous liveness updates.
+ Uses.reset();
+ Defs.reset();
+ LiveOuts.reset();
+ LLVM_DEBUG(dbgs() << "\nBB#" << MBB->getNumber(););
+ // MBB Number in the MSB 32 bits.
+ unsigned MBBInsSize = 0;
+ for (MachineBasicBlock::instr_iterator MII = MBB->instr_begin(),
+ E = MBB->instr_end();
+ MII != E; ++MII) {
+ MachineInstr *MI = &*MII;
+ MBBInsSize += QII->getSize(*MI);
+ // TODO: Handle isDebugInstr
+ if (MI->isBundle() || MI->isDebugInstr())
+ continue;
+ LLVM_DEBUG(dbgs() << "\n\n" << *MI;);
+ // Clear kill and dead markers. LV will recompute them.
+ UseDef_t &MIUseDef = MIUseDefs[MI];
+ MIUseDef.first.resize(NumRegs); // Uses
+ MIUseDef.second.resize(NumRegs); // Defs
+ MIUseDef.first.reset(); // Uses
+ MIUseDef.second.reset(); // Defs
+
+ SmallVector<MachineOperand *, 4> UseRegs;
+ SmallVector<MachineOperand *, 4> DefRegs;
+ SmallVector<unsigned, 1> RegMasks;
+ // Process all of the operands of the instruction...
+ unsigned NumOperandsToProcess = MI->getNumOperands();
+ for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isRegMask()) {
+ // Assuming that predicated defs are not defs, for now.
+ if (!QII->isPredicated(*MI))
+ DefRegs.push_back(&MO);
+ continue;
+ }
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ unsigned Reg = MO.getReg();
+ if (MO.isUse()) {
+ // Assuming that the kill-flags on call-instructions are correct.
+ MO.setIsKill(false);
+ UseRegs.push_back(&MO);
+ MIUseDef.first.set(Reg);
+ } else /*MO.isDef()*/ {
+ assert(MO.isDef());
+ if (!QII->isPredicated(*MI) && !MI->isKill()) {
+ // Assuming that predicated defs are not defs, for now.
+ // KILL instructions are no-ops
+ MO.setIsDead(false);
+ DefRegs.push_back(&MO);
+ }
+ MIUseDef.second.set(Reg); // Set all defs (including predicated).
+ }
+ }
+ // Process all uses.
+ for (unsigned i = 0, e = UseRegs.size(); i != e; ++i)
+ handlePhysRegUse(UseRegs[i], MI, Uses);
+ // Process all defs.
+ for (unsigned i = 0, e = DefRegs.size(); i != e; ++i)
+ handlePhysRegDef(DefRegs[i], MI, Defs);
+ }
+ DistanceMap[MBB] = MBBInsSize;
+}
+
+void HexagonLiveVariablesImpl::handlePhysRegUse(MachineOperand *MO,
+ MachineInstr *MI,
+ BitVector &Uses) {
+ unsigned Reg = MO->getReg();
+ LLVM_DEBUG(dbgs() << "\nLooking at:";);
+ // If the reg/super-reg is already defined in this MBB => return.
+ for (MCSuperRegIterator SupI(Reg, TRI, true); SupI.isValid(); ++SupI) {
+ LLVM_DEBUG(dbgs() << printReg(*SupI, TRI););
+ if (PhysRegDef[*SupI])
+ return;
+ }
+ // Handle if sub-regs are defined.
+ SmallVector<unsigned, 2> undefSubRegs;
+ bool subRegDefined = false;
+ for (MCSubRegIterator SubI(Reg, TRI); SubI.isValid(); ++SubI) {
+ LLVM_DEBUG(dbgs() << printReg(*SubI, TRI););
+ if (PhysRegDef[*SubI])
+ subRegDefined = true;
+ else
+ undefSubRegs.push_back(*SubI);
+ }
+
+ LLVM_DEBUG(dbgs() << "\nUses:");
+ if (undefSubRegs.empty()) {
+ if (!subRegDefined) { // None of the subregs are defined.
+ // Include all subregs (including self) to the uses.
+ for (MCSubRegIterator SubI(Reg, TRI, true); SubI.isValid(); ++SubI) {
+ LLVM_DEBUG(dbgs() << printReg(*SubI, TRI));
+ PhysRegUse[*SubI] = MI;
+ Uses.set(*SubI);
+ }
+ } // All subregs defined.
+ return;
+ }
+ // Some subregs are defined.
+ for (unsigned i = 0; i < undefSubRegs.size(); ++i) {
+ LLVM_DEBUG(dbgs() << printReg(undefSubRegs[i], TRI));
+ PhysRegUse[undefSubRegs[i]] = MI;
+ Uses.set(undefSubRegs[i]);
+ }
+}
+
+// Assumes that an MI cannot have a reg and its super/sub reg as uses.
+void HexagonLiveVariablesImpl::handlePhysRegDef(MachineOperand *MO,
+ MachineInstr *MI,
+ BitVector &Defs) {
+ auto SetRegDef = [&](unsigned Reg) -> void {
+ PhysRegDef[Reg] = MI;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ if (PhysRegUse[*AI]) {
+ LLVM_DEBUG(dbgs() << "\nUsed in current BB:" << printReg(*AI, TRI));
+ return;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\nDefs:" << printReg(Reg, TRI));
+ Defs.set(Reg);
+ };
+
+ if (MO->isReg()) {
+ SetRegDef(MO->getReg());
+ } else if (MO->isRegMask()) {
+ for (unsigned R = 1, NR = TRI->getNumRegs(); R != NR; ++R)
+ if (MO->clobbersPhysReg(R))
+ SetRegDef(R);
+ }
+}
+
+namespace {
+struct BlockState {
+ bool SuccQueued : 1;
+ bool Done : 1;
+ BlockState() : SuccQueued(false), Done(false) {}
+};
+} // namespace
+
+// Populates 'Blocks' with basic blocks of 'Fn' in depth-first order
+static void gatherBlocksDF(MachineFunction &Fn,
+ SmallVectorImpl<MachineBasicBlock *> *Blocks) {
+ Blocks->clear();
+ Blocks->reserve(Fn.size());
+
+ SmallVector<BlockState, 16> State(Fn.size());
+ SmallVector<MachineBasicBlock *, 16> WorkStack;
+ WorkStack.push_back(&Fn.front());
+ while (!WorkStack.empty()) {
+ MachineBasicBlock *W = WorkStack.back();
+ BlockState &WState = State[W->getNumber()];
+ if (WState.Done) {
+ WorkStack.pop_back();
+ continue;
+ }
+ if (W->succ_empty() || WState.SuccQueued) {
+ WorkStack.pop_back();
+ Blocks->push_back(W);
+ WState.SuccQueued = true;
+ WState.Done = true;
+ continue;
+ }
+ WState.SuccQueued = true;
+ for (MachineBasicBlock::succ_iterator I = W->succ_begin(),
+ E = W->succ_end();
+ I != E; ++I) {
+ MachineBasicBlock *S = *I;
+ if (State[S->getNumber()].SuccQueued)
+ continue;
+ WorkStack.push_back(S);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "gatherBlocksDF: {";
+ for (SmallVectorImpl<MachineBasicBlock *>::iterator B = Blocks->begin(),
+ BE = Blocks->end();
+ B != BE; ++B) { dbgs() << " BB#" << (*B)->getNumber(); } dbgs()
+ << " }\n";);
+}
+
+bool HexagonLiveVariablesImpl::updateGlobalLiveness(MachineFunction &Fn) {
+ bool Changed = false;
+ // Removing live-ins and recomputing.
+ MachineFunction::iterator I = Fn.begin(), E = Fn.end();
+ // Not touching the live-ins of entry basic block.
+ for (++I; I != E; ++I) {
+ std::vector<MachineBasicBlock::RegisterMaskPair> OldLiveIn(
+ I->livein_begin(), I->livein_end());
+ for (unsigned i = 0; i < OldLiveIn.size(); ++i)
+ I->removeLiveIn(OldLiveIn[i].PhysReg);
+ }
+
+ gatherBlocksDF(Fn, &BlocksDepthFirst);
+
+ BitVector Defs;
+ BitVector LiveIns;
+ bool Repeat;
+ do {
+ Repeat = false;
+ for (SmallVectorImpl<MachineBasicBlock *>::iterator
+ B = BlocksDepthFirst.begin(),
+ BE = BlocksDepthFirst.end();
+ B != BE; ++B) {
+ Repeat |= updateGlobalLiveness(*B, Defs, LiveIns);
+ }
+ Changed |= Repeat;
+ } while (Repeat);
+
+ Changed |= updateLocalLiveness(Fn);
+ return Changed;
+}
+
+bool HexagonLiveVariablesImpl::updateGlobalLiveness(MachineBasicBlock *X,
+ MachineBasicBlock *Y) {
+ assert(X && "Invalid start block");
+ assert(Y && "Invalid end block");
+
+ bool Changed = false;
+ BitVector Defs;
+ BitVector LiveIns;
+
+ const SmallVectorImpl<MachineBasicBlock *>::iterator BE =
+ BlocksDepthFirst.end();
+ SmallVectorImpl<MachineBasicBlock *>::iterator B;
+ for (B = BlocksDepthFirst.begin(); (B != BE); ++B) {
+ if (*B == X)
+ break;
+ if (*B == Y)
+ break;
+ }
+
+ bool Repeat;
+ do {
+ Repeat = false;
+ for (; B != BE; ++B)
+ Repeat |= updateGlobalLiveness(*B, Defs, LiveIns);
+ Changed |= Repeat;
+ B = BlocksDepthFirst.begin();
+ } while (Repeat);
+
+ return Changed;
+}
+
+// Defs and LiveIns could be local variables within updateGlobalLiveness, but
+// have been pulled out to (hopefully) improve performance.
+bool HexagonLiveVariablesImpl::updateGlobalLiveness(MachineBasicBlock *MBB,
+ BitVector &Defs,
+ BitVector &LiveIns) {
+ LLVM_DEBUG(dbgs() << "\nTrying to Update Liveness MBB#" << MBB->getNumber());
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "\nUpdating Liveness MBB#" << MBB->getNumber());
+ // Update live-outs
+ auto LiveOutIt = MBBLiveOuts.find(MBB);
+ if (LiveOutIt == MBBLiveOuts.end())
+ LiveOutIt = MBBLiveOuts.insert({MBB, BitVector(NumRegs)}).first;
+ BitVector &LiveOuts = LiveOutIt->second;
+ for (MachineBasicBlock::succ_iterator MBBSucc = MBB->succ_begin();
+ MBBSucc != MBB->succ_end(); ++MBBSucc) {
+ MachineBasicBlock *Succ = *MBBSucc;
+ LLVM_DEBUG(dbgs() << "\n\t\tAdding LiveOut:";);
+ for (MachineBasicBlock::livein_iterator LI = Succ->livein_begin(),
+ LE = Succ->livein_end();
+ LI != LE; ++LI) {
+ if (!LiveOuts[(*LI).PhysReg]) {
+ LLVM_DEBUG(dbgs() << " " << printReg((*LI).PhysReg, TRI););
+ LiveOuts.set((*LI).PhysReg);
+ Changed = true;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\nUpdated Successors of MBB#" << MBB->getNumber());
+ // Update live-ins
+ Changed |= updateLiveIns(MBB, LiveIns, LiveOuts);
+
+ return Changed;
+}
+
+// update live-ins when live-out has been calculated
+bool HexagonLiveVariablesImpl::updateLiveIns(MachineBasicBlock *MBB,
+ BitVector &LiveIns,
+ const BitVector &LiveOuts) {
+ LLVM_DEBUG(dbgs() << "\n[updateLiveIns] MBB#" << MBB->getNumber());
+ bool Changed = false;
+ const std::pair<BitVector, BitVector> &UseDefs = MBBUseDefs[MBB];
+ LiveIns = LiveOuts;
+ // LiveIns = (LiveOuts - Defs) | Uses
+ // Equivalent to: LiveIns = (LiveOuts & ~Defs) | Uses
+ LiveIns.reset(UseDefs.second);
+ LiveIns |= UseDefs.first;
+ LLVM_DEBUG(dbgs() << "\n\t\tAdded LiveIn:";);
+ for (int i = LiveIns.find_first(); i >= 0; i = LiveIns.find_next(i)) {
+ // TODO: remove costly check of MBB->isLiveIn when fully functional.
+ if (!MBB->isLiveIn(i) && MRI->isAllocatable(i)) {
+ LLVM_DEBUG(dbgs() << " " << printReg(i, TRI));
+ MBB->addLiveIn(i);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool HexagonLiveVariablesImpl::updateLiveOuts(MachineBasicBlock *MBB,
+ BitVector &LiveOuts) {
+ bool Changed = false;
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock *SB = *SI;
+ for (auto I = SB->livein_begin(), E = SB->livein_end(); I != E; ++I) {
+ unsigned R = (*I).PhysReg;
+ if (LiveOuts[R])
+ continue;
+ LiveOuts.set(R);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool HexagonLiveVariablesImpl::updateLocalLiveness(MachineFunction &Fn) {
+ LLVM_DEBUG(dbgs() << "\n[updateLocalLiveness]");
+ for (MachineFunction::iterator B = Fn.begin(), E = Fn.end(); B != E; ++B)
+ updateLocalLiveness(&*B, false);
+ return true;
+}
+
+bool HexagonLiveVariablesImpl::updateLocalLiveness(MachineBasicBlock *MBB,
+ bool UpdateBundle) {
+ assert(MBB && "Invalid basic block");
+ LLVM_DEBUG(dbgs() << "\n[updateLocalLiveness] MBB#" << MBB->getNumber());
+
+ BitVector &LiveOut = MBBLiveOuts[MBB];
+ updateLiveOuts(MBB, LiveOut);
+
+ BitVector Used = LiveOut;
+ SmallVector<MachineInstr *, 2> BundleHeads;
+ // Bottom up traversal of MBB.
+ for (MachineBasicBlock::reverse_instr_iterator MII = MBB->instr_rbegin(),
+ MIREnd = MBB->instr_rend();
+ MII != MIREnd; ++MII) {
+ MachineInstr *MI = &*MII;
+ // The bundle liveness is updated differently.
+ if (MI->isBundle()) {
+ if (UpdateBundle)
+ BundleHeads.push_back(MI);
+ continue;
+ }
+ if (MI->isDebugInstr()) // DBG_VALUE may have invalid reg.
+ continue;
+ SmallVector<MachineOperand *, 4> UseRegs;
+ SmallVector<MachineOperand *, 2> DefRegs;
+ for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg()) { // DBG_VALUE may have invalid reg.
+ if (MO.isUse())
+ UseRegs.push_back(&MO);
+ else { // Def
+ if (!QII->isPredicated(*MI) && !MI->isKill()) {
+ // Assuming that predicated defs are not defs, for now.
+ // KILL instructions are no-ops
+ DefRegs.push_back(&MO);
+ }
+ }
+ } else if (MO.isRegMask()) {
+ if (!QII->isPredicated(*MI))
+ DefRegs.push_back(&MO);
+ }
+ }
+ // In case of a def. remove Reg and its sub-regs from Used list
+ // such that uses in the same MI can be marked as kill.
+ auto RemoveDef = [&](unsigned Reg, bool Implicit) -> void {
+ for (MCSubRegIterator SI(Reg, TRI, true); SI.isValid(); ++SI) {
+ Used.reset(*SI);
+ if (Implicit) {
+ // For implicit defs, check if there is an implicit use of an
+ // aliased register. If so, mark the aliased reg as used.
+ for (auto *UseOp : UseRegs)
+ if (UseOp->isImplicit() && TRI->regsOverlap(*SI, UseOp->getReg()))
+ Used.set(UseOp->getReg());
+ }
+ }
+ };
+ for (unsigned i = 0; i < DefRegs.size(); ++i) {
+ MachineOperand &MO = *DefRegs[i];
+ if (MO.isReg()) {
+ RemoveDef(MO.getReg(), MO.isImplicit());
+ } else if (MO.isRegMask()) {
+ for (unsigned R = 1, NR = TRI->getNumRegs(); R != NR; ++R)
+ if (MO.clobbersPhysReg(R))
+ RemoveDef(R, true);
+ }
+ }
+ // The order is important as we are looking from right to left.
+ for (unsigned i = UseRegs.size(); i > 0;) {
+ --i;
+ unsigned UseReg = UseRegs[i]->getReg();
+ bool Killed = true;
+ for (MCRegAliasIterator AI(UseReg, TRI, true); AI.isValid(); ++AI) {
+ if (Used[*AI])
+ Killed = false;
+ }
+ Used.set(UseReg);
+ if (Killed && !UseRegs[i]->isDebug())
+ UseRegs[i]->setIsKill(true);
+ }
+ }
+ // Recreates bundle for updating liveness.
+ for (SmallVectorImpl<MachineInstr *>::iterator MII = BundleHeads.begin();
+ MII != BundleHeads.end(); ++MII) {
+ MachineInstr *MI = *MII;
+ assert(MI && "Invalid bundle head");
+ assert(MI->isBundle() && "Expected a bundle head instruction");
+ assert(MI->getParent() == MBB && "Bundle head not in expected block");
+ MachineBasicBlock::instr_iterator BS = MI->getIterator();
+ MachineBasicBlock::instr_iterator BE = getBundleEnd(BS);
+ for (++BS; BS != BE; ++BS)
+ // Remove from bundle so that BUNDLE head can be erased.
+ BS->unbundleFromPred();
+
+ BS = MI->getIterator();
+ ++BS;
+ bool memShufDisabled = QII->getBundleNoShuf(*MI);
+ MI->eraseFromParent();
+ finalizeBundle(*MBB, BS, BE);
+ MachineBasicBlock::instr_iterator BundleMII = std::prev(BS);
+ if (memShufDisabled)
+ QII->setBundleNoShuf(BundleMII);
+ }
+ return true;
+}
+
+// Deletes the live-ins of the \p From MBB before recomputing liveness.
+bool HexagonLiveVariablesImpl::incrementalUpdate(MICInstIterType MIDelta,
+ MachineBasicBlock *From,
+ MachineBasicBlock *To) {
+ while (!From->livein_empty())
+ From->removeLiveIn((*From->livein_begin()).PhysReg);
+ // Handle MI use-def of From.
+ constructUseDef(From);
+ // Handle MI use-def of To.
+ constructUseDef(To);
+ // Calculate live-in of From and To
+ // Reuse this by setting all MBBs except From and To as visited.
+ updateGlobalLiveness(From, To);
+ // Update local liveness of To.
+ updateLocalLiveness(From, true);
+ updateLocalLiveness(To, true);
+
+ // Do this after the liveness update because MIDelta might not be in the
+ // MIUseDefs before liveness update (since MIDelta might be newly inserted).
+ MIUseDef_t::const_iterator MIUseDef = MIUseDefs.find(&*MIDelta);
+ if (MIUseDef == MIUseDefs.end())
+ llvm_unreachable("MIDelta not found in MIUseDefs after liveness update");
+ const BitVector &Defs = MIUseDef->second.second;
+ int Reg = Defs.find_first();
+ // Adding all the defs as live-ins. This is conservative approach but we
+ // need to add them so as to avoid dealing with callee saved registers and
+ // any unwanted errors in liveness that might arise.
+ while (Reg >= 0) {
+ From->addLiveIn(Reg);
+ Reg = Defs.find_next(Reg);
+ }
+ return true;
+}
+
+void HexagonLiveVariablesImpl::addNewMBB(MachineBasicBlock *MBB) {
+ // Resize and init.
+ constructUseDef(MBB); // This is to set up some containers for MBB.
+ gatherBlocksDF(*MBB->getParent(), &BlocksDepthFirst);
+ updateGlobalLiveness(MBB, MBB);
+}
+
+// TODO: This is a slow implementation because constructUseDef destroys
+// the MBBLiveOuts which is generated again by updateGlobalLiveness.
+void HexagonLiveVariablesImpl::addNewMI(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+ constructUseDef(MBB); // This is to set up some containers for MBB.
+ updateGlobalLiveness(MBB, MBB);
+}
+
+void HexagonLiveVariablesImpl::generateDistanceMap(const MachineFunction &Fn) {
+ assert(DistanceMap.empty() && "DistanceMap not empty, first clear!");
+ for (MachineFunction::const_iterator MBBI = Fn.begin(), E = Fn.end();
+ MBBI != E; ++MBBI) {
+ const MachineBasicBlock *MBB = &*MBBI;
+ unsigned MBBInsSize = 0;
+ for (MachineBasicBlock::const_instr_iterator MII = MBB->instr_begin(),
+ E = MBB->instr_end();
+ MII != E; ++MII) {
+ const MachineInstr *MI = &*MII;
+ MBBInsSize += QII->getSize(*MI);
+ }
+ DistanceMap[MBB] = MBBInsSize;
+ }
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonLiveVariables.h b/llvm/lib/Target/Hexagon/HexagonLiveVariables.h
new file mode 100644
index 0000000000000..396145d49db45
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonLiveVariables.h
@@ -0,0 +1,134 @@
+//===----------------- HexagonLiveVariables.h ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Hexagon Live Variable Analysis
+// This file implements the Hexagon specific LiveVariables analysis pass.
+// 1. Computes the live variables by analyzing the use-defs.
+// - The use-def specifiers are 'assumed' to be correct for each operand.
+// 2. Re-calculates the MBB numbers so that they are in sequence.
+// TODO: Mark dead instructions.
+// TODO: Provide APIs like the target independent Liveness Analysis so that
+// other passes can reuse the liveness information.
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGON_LIVEVARIABLES_H
+#define HEXAGON_LIVEVARIABLES_H
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cstdint>
+#include <list>
+
+class HexagonLiveVariablesImpl;
+
+namespace llvm {
+
+typedef std::pair<BitVector, BitVector> UseDef_t; // (Use, Def)
+typedef DenseMap<MachineBasicBlock *, UseDef_t> MBBUseDef_t;
+typedef DenseMap<const MachineInstr *, UseDef_t> MIUseDef_t;
+
+// List of intervals [From, To).
+typedef std::list<std::pair<int64_t, int64_t>> IntervalList_t;
+// Intervals stored in indexed form.
+typedef SmallVector<IntervalList_t, 0> IndexedLiveIntervals_t;
+
+class HexagonLiveVariables : public MachineFunctionPass {
+public:
+ typedef MachineBasicBlock::const_instr_iterator MICInstIterType;
+
+ static char ID; // Pass identification, replacement for typeid
+ bool HLVComplete;
+ HexagonLiveVariables();
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ StringRef getPassName() const override {
+ return "Hexagon Live Variables Analysis";
+ }
+
+ /// recalculate - recalculates the liveness from scratch. It is like
+ /// calling the runOnMachineFunction.
+ void recalculate(MachineFunction &MF);
+
+ /// updateLocalLiveness - update only kill flags of operands.
+ /// Assumes that global liveness is correct.
+ bool updateLocalLiveness(MachineFunction &Fn);
+
+ /// updateLocalLiveness - update only kill flags of operands in MBB.
+ /// Assumes that global liveness is correct.
+ /// This is useful when a local transformation modifies MIs,
+ /// which only changes the local liveness.
+ bool updateLocalLiveness(MachineBasicBlock *MBB, bool updateBundle);
+
+ /// incrementalUpdate - update the liveness when \p MIDelta is moved from
+ /// \p From to \p To.
+ /// @note: This is extremely fragile now. It 'assumes' that the other
+ /// successor(s) of \p To do not use Defs of MIDelta.
+ bool incrementalUpdate(MICInstIterType MIDelta, MachineBasicBlock *From,
+ MachineBasicBlock *To);
+ // addNewMI - update internal data-structures of Live Variable Analysis.
+ void addNewMI(MachineInstr *MI, MachineBasicBlock *MBB);
+
+ /// addNewMBB - informs the LiveVariable Analysis that a new MBB has been
+ /// added, and updates the liveness of this new MBB.
+ /// @note MBB should be empty. If we want to add an MI, add it after calling
+ /// this function.
+ void addNewMBB(MachineBasicBlock *MBB);
+
+ /// @brief Constructs use-defs of \p MBB by analyzing each MachineOperand.
+ /// Collects relevant information so that global liveness can be updated.
+ void constructUseDef(MachineBasicBlock *MBB);
+
+ bool isLiveOut(const MachineBasicBlock *MBB, unsigned Reg) const;
+ const BitVector &getLiveOuts(const MachineBasicBlock *MBB) const;
+
+ // Returns true when \p Reg is used within [MIBegin, MIEnd)
+ // @note: MIBegin and MIEnd should be from same MBB
+ // @note: It returns just the first use found in the range.
+ // The Use is closest to MIEnd.
+ // Takes care of aliases as well.
+ bool
+ isUsedWithin(MICInstIterType MIBegin, MICInstIterType MIEnd, unsigned Reg,
+ MICInstIterType &Use,
+ SmallPtrSet<MachineInstr *, 2> *ExceptionsList = nullptr) const;
+ // Returns true when \p Reg is defined within [MIBegin, MIEnd)
+ // @note: MIBegin and MIEnd should be from same MBB
+ // The Def is closest to MIEnd.
+ // Takes care of aliases as well.
+ bool isDefinedWithin(MICInstIterType MIBegin, MICInstIterType MIEnd,
+ unsigned Reg, MICInstIterType &Def) const;
+ bool isDefLiveIn(const MachineInstr *MI, const MachineBasicBlock *MBB) const;
+ MBBUseDef_t &getMBBUseDefs();
+ MIUseDef_t &getMIUseDefs();
+
+ /// Returns the linear (layout) distance between \p From and \p To.
+ /// \p BufferPerMBB is to allow some room for .falign (if added later).
+ unsigned getDistanceBetween(const MachineBasicBlock *From,
+ const MachineBasicBlock *To,
+ unsigned BufferPerMBB = HEXAGON_INSTR_SIZE) const;
+
+ // recalculate the distance map.
+ void regenerateDistanceMap(const MachineFunction &Fn);
+
+private:
+ std::unique_ptr<HexagonLiveVariablesImpl> HLV;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 2f14622cab57c..cfe898fe767dc 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -66,6 +66,9 @@ static cl::opt<bool> DisableHexagonMask(
"disable-mask", cl::Hidden,
cl::desc("Disable Hexagon specific Mask generation pass"));
+static cl::opt<bool> DisableHexagonLiveVars(
+ "disable-hlv", cl::Hidden,
+ cl::desc("Disable Hexagon specific post-RA live-variable analysis"));
static cl::opt<bool> DisableStoreWidening("disable-store-widen", cl::Hidden,
cl::init(false),
cl::desc("Disable store widening"));
@@ -192,6 +195,7 @@ LLVMInitializeHexagonTarget() {
initializeHexagonEarlyIfConversionPass(PR);
initializeHexagonGenMemAbsolutePass(PR);
initializeHexagonGenMuxPass(PR);
+ initializeHexagonLiveVariablesPass(PR);
initializeHexagonHardwareLoopsPass(PR);
initializeHexagonLoopIdiomRecognizeLegacyPassPass(PR);
initializeHexagonNewValueJumpPass(PR);
@@ -448,6 +452,10 @@ void HexagonPassConfig::addPreSched2() {
addPass(createHexagonSplitConst32AndConst64());
if (!NoOpt && !DisableHexagonMask)
addPass(createHexagonMask());
+
+ if (!NoOpt && !DisableHexagonLiveVars) {
+ addPass(&HexagonLiveVariablesID);
+ }
}
void HexagonPassConfig::addPreEmitPass() {
@@ -464,6 +472,8 @@ void HexagonPassConfig::addPreEmitPass() {
// Generate MUX from pairs of conditional transfers.
if (EnableGenMux)
addPass(createHexagonGenMux());
+ if (!DisableHexagonLiveVars)
+ addPass(&HexagonLiveVariablesID);
}
// Packetization is mandatory: it handles gather/scatter at all opt levels.
diff --git a/llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll b/llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll
index 463e34da7a5c0..34eec1faa951c 100644
--- a/llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll
+++ b/llvm/test/CodeGen/Hexagon/isel/mulh-scalar.ll
@@ -209,13 +209,13 @@ define <4 x i16> @f6(<4 x i16> %a0, <4 x i16> %a1) #0 {
; CHECK-NEXT: r5:4 = vmpyh(r0,r2):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7:6 = vmpyh(r1,r3):sat
+; CHECK-NEXT: r{{[0-9]+}}:{{[0-9]+}} = vmpyh(r1,r3):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = combine(r5.h,r4.h)
+; CHECK-NEXT: r0 = combine(r{{[0-9]+}}.h,r{{[0-9]+}}.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r1 = combine(r7.h,r6.h)
+; CHECK-NEXT: r1 = combine(r{{[0-9]+}}.h,r{{[0-9]+}}.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
diff --git a/llvm/test/CodeGen/Hexagon/live-vars/live-outs.ll b/llvm/test/CodeGen/Hexagon/live-vars/live-outs.ll
new file mode 100644
index 0000000000000..08b1195928d1f
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/live-vars/live-outs.ll
@@ -0,0 +1,77 @@
+; RUN: llc -O3 -verify-machineinstrs < %s -o /dev/null
+; REQUIRES: asserts
+;
+; Compile-only regression test (asserts build): llc must complete at -O3 with -verify-machineinstrs enabled, without machine-verifier or assertion failures.
+
+define i32 @foo(ptr nocapture readnone %x, i32 %n, ptr nocapture readonly %p,
+ ptr nocapture readonly %q, ptr %b) {
+entry:
+ %cmp = icmp eq i32 %n, 0
+ br i1 %cmp, label %return, label %if.end
+
+if.end: ; preds = %entry
+ %div = lshr i32 %n, 3
+ %cmp149 = icmp eq i32 %div, 0
+ br i1 %cmp149, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %if.end
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %arrayidx.phi = phi ptr [ %arrayidx.inc, %for.inc ], [ %p, %for.body.preheader ]
+ %arrayidx2.phi = phi ptr [ %arrayidx2.inc, %for.inc ], [ %q, %for.body.preheader ]
+ %i.050 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+ %0 = load i8, ptr %arrayidx.phi, align 1
+ %1 = load i8, ptr %arrayidx2.phi, align 1
+ %cmp4 = icmp eq i8 %0, %1
+ br i1 %cmp4, label %for.inc, label %for.end.loopexit
+
+for.inc: ; preds = %for.body
+ %arrayidx2.inc = getelementptr i8, ptr %arrayidx2.phi, i32 1
+ %arrayidx.inc = getelementptr i8, ptr %arrayidx.phi, i32 1
+ %inc = add nuw nsw i32 %i.050, 1
+ %cmp1 = icmp ult i32 %inc, %div
+ br i1 %cmp1, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body, %for.inc
+ %i.0.lcssa.ph = phi i32 [ %i.050, %for.body ], [ %inc, %for.inc ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %if.end
+ %i.0.lcssa = phi i32 [ 0, %if.end ], [ %i.0.lcssa.ph, %for.end.loopexit ]
+ %cmp8 = icmp eq i32 %i.0.lcssa, %div
+ br i1 %cmp8, label %if.end30, label %if.then10
+
+if.then10: ; preds = %for.end
+ %rem = and i32 %n, 7
+ %cmp11 = icmp eq i32 %rem, 0
+ br i1 %cmp11, label %return, label %if.end14
+
+if.end14: ; preds = %if.then10
+ %sub = sub nsw i32 8, %rem
+ %shl = shl i32 1, %sub
+ %sub16 = add i32 %shl, 255
+ %arrayidx18 = getelementptr inbounds i8, ptr %p, i32 %i.0.lcssa
+ %2 = load i8, ptr %arrayidx18, align 1
+ %sub16.not = or i32 %sub16, -256
+ %neg = xor i32 %sub16.not, 255
+ %arrayidx21 = getelementptr inbounds i8, ptr %q, i32 %i.0.lcssa
+ %3 = load i8, ptr %arrayidx21, align 1
+ %4 = xor i8 %3, %2
+ %5 = zext i8 %4 to i32
+ %6 = and i32 %5, %neg
+ %cmp26 = icmp eq i32 %6, 0
+ br i1 %cmp26, label %return, label %if.end30
+
+if.end30: ; preds = %for.end, %if.end14
+ %cmp31 = icmp eq ptr %b, null
+ br i1 %cmp31, label %return, label %if.then33
+
+if.then33: ; preds = %if.end30
+ store i8 0, ptr %b, align 1
+ br label %return
+
+return: ; preds = %if.end30, %if.then33, %if.end14, %if.then10, %entry
+ %retval.0 = phi i32 [ 0, %entry ], [ 1, %if.then10 ], [ 1, %if.end14 ], [ 0, %if.then33 ], [ 0, %if.end30 ]
+ ret i32 %retval.0
+}
diff --git a/llvm/test/CodeGen/Hexagon/nbench1.ll b/llvm/test/CodeGen/Hexagon/nbench1.ll
index 25a83db5717e3..04335ce4fa1c1 100644
--- a/llvm/test/CodeGen/Hexagon/nbench1.ll
+++ b/llvm/test/CodeGen/Hexagon/nbench1.ll
@@ -3,9 +3,9 @@
; if instruction being considered for addition to packet has higher latency,
; end existing packet and start a new one.
-; CHECK: .LBB0_4:
; CHECK: p{{[0-3]+}} = cmp.gtu(r{{[0-9]+}},r{{[0-9]+}})
-; CHECK-NEXT: }
+; CHECK: if (p{{[0-3]+}}.new) jumpr:nt r31
+; CHECK: }
@array = external dso_local local_unnamed_addr global ptr, align 4
diff --git a/llvm/test/CodeGen/Hexagon/newvaluejump-kill.ll b/llvm/test/CodeGen/Hexagon/newvaluejump-kill.ll
index e3f36988f9387..a6c3b599586f6 100644
--- a/llvm/test/CodeGen/Hexagon/newvaluejump-kill.ll
+++ b/llvm/test/CodeGen/Hexagon/newvaluejump-kill.ll
@@ -2,7 +2,8 @@
;
; Check that this testcase compiles successfully and that a new-value jump
; has been created.
-; CHECK: if (cmp.gtu(r{{[0-9]+}}.new,r{{[0-9]+}})) jump
+; CHECK: p{{[0-3]+}} = cmp.gtu(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-SAME: if (p{{[0-3]+}}.new) jump
target triple = "hexagon"
diff --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
index a0aeb80a5fa93..cfb459c6d2055 100644
--- a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
@@ -3,7 +3,7 @@
; From coremark. Test that we pipeline the matrix multiplication bitextract
; function. The pipelined code should have two packets.
-; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: loop{{[01]}}(.LBB0_[[LOOP:[0-9]+]],r{{[0-9]+}})
; CHECK: .LBB0_[[LOOP]]:
; CHECK: [[REG0:(r[0-9]+)]] = mpyi([[REG1:(r[0-9]+)]],[[REG2:(r[0-9]+)]])
; CHECK: += mpyi
@@ -11,7 +11,7 @@
; CHECK: = extractu([[REG0:(r[0-9]+)]],
; CHECK: = extractu([[REG0]],
; CHECK: [[REG2:(r[0-9]+)]] = memh
-; CHECK: endloop0
+; CHECK: endloop{{[01]}}
%union_h2_sem_t = type { i32 }
>From 9831be9d091bac4cdfa6d499dce5a23af4cb249a Mon Sep 17 00:00:00 2001
From: Kewen Meng <Kewen.Meng at amd.com>
Date: Fri, 6 Feb 2026 13:24:40 -0800
Subject: [PATCH 04/13] Revert "[LV] Support conditional scalar assignments of
masked operations" (#180275)
Reverts llvm/llvm-project#178862
revert to unblock bot:
https://lab.llvm.org/buildbot/#/builders/206/builds/13225
---
llvm/lib/Analysis/IVDescriptors.cpp | 53 +-
.../Vectorize/VPlanConstruction.cpp | 27 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
.../AArch64/conditional-scalar-assignment.ll | 1144 -----------------
.../X86/vectorization-remarks-missed.ll | 9 +
.../conditional-scalar-assignment-vplan.ll | 100 --
llvm/unittests/Analysis/IVDescriptorsTest.cpp | 79 --
7 files changed, 16 insertions(+), 1398 deletions(-)
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 289da458c9c08..2a213d6be1470 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -261,23 +261,6 @@ static bool isMinMaxReductionPhiWithUsersOutsideReductionChain(
return true;
}
-// This matches a phi that selects between the original value (HeaderPhi) and an
-// arbitrary non-reduction value.
-static bool isFindLastLikePhi(PHINode *Phi, PHINode *HeaderPhi,
- SmallPtrSetImpl<Instruction *> &ReductionInstrs) {
- unsigned NumNonReduxInputs = 0;
- for (const Value *Op : Phi->operands()) {
- if (!ReductionInstrs.contains(dyn_cast<Instruction>(Op))) {
- if (++NumNonReduxInputs > 1)
- return false;
- } else if (Op != HeaderPhi) {
- // TODO: Remove this restriction once chained phis are supported.
- return false;
- }
- }
- return NumNonReduxInputs == 1;
-}
-
bool RecurrenceDescriptor::AddReductionVar(
PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
@@ -324,14 +307,6 @@ bool RecurrenceDescriptor::AddReductionVar(
unsigned NumCmpSelectPatternInst = 0;
InstDesc ReduxDesc(false, nullptr);
- // To recognize find-lasts of conditional operations (such as loads or
- // divides), that need masking, we track non-phi users and if we've found a
- // "find-last-like" phi (see isFindLastLikePhi). We currently only support
- // find-last reduction chains with a single "find-last-like" phi and do not
- // allow any other operations.
- unsigned NumNonPHIUsers = 0;
- bool FoundFindLastLikePhi = false;
-
// Data used for determining if the recurrence has been type-promoted.
Type *RecurrenceType = Phi->getType();
SmallPtrSet<Instruction *, 4> CastInsts;
@@ -439,8 +414,6 @@ bool RecurrenceDescriptor::AddReductionVar(
return false;
bool IsAPhi = isa<PHINode>(Cur);
- if (!IsAPhi)
- ++NumNonPHIUsers;
// A header PHI use other than the original PHI.
if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
@@ -497,21 +470,9 @@ bool RecurrenceDescriptor::AddReductionVar(
!isAnyOfRecurrenceKind(Kind) && hasMultipleUsesOf(Cur, VisitedInsts, 1))
return false;
- // All inputs to a PHI node must be a reduction value, unless the phi is a
- // "FindLast-like" phi (described below).
- if (IsAPhi && Cur != Phi) {
- if (!areAllUsesIn(Cur, VisitedInsts)) {
- // A "FindLast-like" phi acts like a conditional select between the
- // previous reduction value, and an arbitrary value. Note: Multiple
- // "FindLast-like" phis are not supported see:
- // IVDescriptorsTest.UnsupportedFindLastPhi.
- FoundFindLastLikePhi =
- Kind == RecurKind::FindLast && !FoundFindLastLikePhi &&
- isFindLastLikePhi(cast<PHINode>(Cur), Phi, VisitedInsts);
- if (!FoundFindLastLikePhi)
- return false;
- }
- }
+ // All inputs to a PHI node must be a reduction value.
+ if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
+ return false;
if (isIntMinMaxRecurrenceKind(Kind) && (isa<ICmpInst>(Cur) || IsASelect))
++NumCmpSelectPatternInst;
@@ -521,7 +482,7 @@ bool RecurrenceDescriptor::AddReductionVar(
++NumCmpSelectPatternInst;
// Check whether we found a reduction operator.
- FoundReduxOp |= (!IsAPhi || FoundFindLastLikePhi) && Cur != Start;
+ FoundReduxOp |= !IsAPhi && Cur != Start;
// Process users of current instruction. Push non-PHI nodes after PHI nodes
// onto the stack. This way we are going to have seen all inputs to PHI
@@ -595,12 +556,6 @@ bool RecurrenceDescriptor::AddReductionVar(
Worklist.append(NonPHIs.begin(), NonPHIs.end());
}
- // We only expect to match a single "find-last-like" phi per find-last
- // reduction, with no non-phi operations in the reduction use chain.
- assert((!FoundFindLastLikePhi ||
- (Kind == RecurKind::FindLast && NumNonPHIUsers == 0)) &&
- "Unexpectedly matched a 'find-last-like' phi");
-
// This means we have seen one but not the other instruction of the
// pattern or more than just a select and cmp. Zero implies that we saw a
// llvm.min/max intrinsic, which is always OK.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index eea9952243cd3..94d22cc67088d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1356,34 +1356,11 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan) {
PhiR->getRecurrenceKind()))
continue;
- // Find the condition for the select/blend.
+ // Find the condition for the select.
auto *SelectR = cast<VPSingleDefRecipe>(&PhiR->getBackedgeRecipe());
VPValue *Cond = nullptr, *Op1 = nullptr, *Op2 = nullptr;
-
- // If we're matching a blend rather than a select, there should be one
- // incoming value which is the data, then all other incoming values should
- // be the phi.
- auto MatchBlend = [&](VPRecipeBase *R) {
- auto *Blend = dyn_cast<VPBlendRecipe>(R);
- if (!Blend)
- return false;
- assert(!Blend->isNormalized() && "must run before blend normalizaion");
- unsigned NumIncomingDataValues = 0;
- for (unsigned I = 0; I < Blend->getNumIncomingValues(); ++I) {
- VPValue *Incoming = Blend->getIncomingValue(I);
- if (Incoming != PhiR) {
- ++NumIncomingDataValues;
- Cond = Blend->getMask(I);
- Op1 = Incoming;
- Op2 = PhiR;
- }
- }
- return NumIncomingDataValues == 1;
- };
-
if (!match(SelectR,
- m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2))) &&
- !MatchBlend(SelectR))
+ m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2))))
return false;
// Add mask phi.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1d0c3d7a61f94..6057de60adb72 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2742,7 +2742,7 @@ void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
O << " ";
getIncomingValue(I)->printAsOperand(O, SlotTracker);
- if (I == 0 && isNormalized())
+ if (I == 0)
continue;
O << "/";
getMask(I)->printAsOperand(O, SlotTracker);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
index 7053aa60b2035..aa31b0ff2bb1a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll
@@ -823,1150 +823,6 @@ exit:
ret i32 %select.data
}
-; This test is derived from the following C program:
-; int simple_csa_int_load(int* a, int* b, int default_val, int N, int threshold) {
-; int result = default_val;
-; for (int i = 0; i < N; ++i)
-; if (a[i] > threshold)
-; result = b[i];
-; return result;
-; }
-define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
-; NEON-LABEL: define i32 @simple_csa_int_load(
-; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; NEON: [[IF_THEN]]:
-; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-; SVE-LABEL: define i32 @simple_csa_int_load(
-; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
-; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; SVE: [[VECTOR_PH]]:
-; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
-; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
-; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; SVE: [[VECTOR_BODY]]:
-; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
-; SVE-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; SVE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
-; SVE-NEXT: [[TMP8:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
-; SVE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP8]])
-; SVE-NEXT: [[TMP10]] = select i1 [[TMP9]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
-; SVE-NEXT: [[TMP11]] = select i1 [[TMP9]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
-; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; SVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; SVE: [[MIDDLE_BLOCK]]:
-; SVE-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
-; SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP13]])
-; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; SVE: [[SCALAR_PH]]:
-; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; SVE: [[IF_THEN]]:
-; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
-; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
- %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
- %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
- %ld.a = load i32, ptr %a.addr, align 4
- %if.cond = icmp sgt i32 %ld.a, %threshold
- br i1 %if.cond, label %if.then, label %latch
-
-if.then:
- %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
- %ld.b = load i32, ptr %b.addr, align 4
- br label %latch
-
-latch:
- %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ]
- %iv.next = add nuw nsw i64 %iv, 1
- %exit.cmp = icmp eq i64 %iv.next, %N
- br i1 %exit.cmp, label %exit, label %loop
-
-exit:
- ret i32 %select.data
-}
-
-; This test is derived from the following loop:
-; int simple_csa_int_divide(int* a, int* b, int default_val, int N, int threshold) {
-; int result = default_val;
-; for (int i = 0; i < N; ++i)
-; if (a[i] > threshold)
-; result = 42 / a[i]
-; return result;
-; }
-define i32 @simple_csa_int_divide(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
-; NEON-LABEL: define i32 @simple_csa_int_divide(
-; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; NEON: [[IF_THEN]]:
-; NEON-NEXT: [[DIV:%.*]] = sdiv i32 42, [[LD_A]]
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[DIV]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-; SVE-LABEL: define i32 @simple_csa_int_divide(
-; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
-; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; SVE: [[VECTOR_PH]]:
-; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
-; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
-; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; SVE: [[VECTOR_BODY]]:
-; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
-; SVE-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; SVE-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> splat (i32 1)
-; SVE-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i32> splat (i32 42), [[TMP7]]
-; SVE-NEXT: [[TMP9:%.*]] = freeze <vscale x 4 x i1> [[TMP6]]
-; SVE-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; SVE-NEXT: [[TMP11]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP4]]
-; SVE-NEXT: [[TMP12]] = select i1 [[TMP10]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]]
-; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; SVE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; SVE: [[MIDDLE_BLOCK]]:
-; SVE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
-; SVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i1> [[TMP11]], i32 [[TMP14]])
-; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; SVE: [[SCALAR_PH]]:
-; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; SVE: [[IF_THEN]]:
-; SVE-NEXT: [[DIV:%.*]] = sdiv i32 42, [[LD_A]]
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[SELECT_DATA]] = phi i32 [ [[DIV]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
-; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
- %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
- %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
- %ld.a = load i32, ptr %a.addr, align 4
- %if.cond = icmp sgt i32 %ld.a, %threshold
- br i1 %if.cond, label %if.then, label %latch
-
-if.then:
- %div = sdiv i32 42, %ld.a
- br label %latch
-
-latch:
- %select.data = phi i32 [ %div, %if.then ], [ %data.phi, %loop ]
- %iv.next = add nuw nsw i64 %iv, 1
- %exit.cmp = icmp eq i64 %iv.next, %N
- br i1 %exit.cmp, label %exit, label %loop
-
-exit:
- ret i32 %select.data
-}
-
-; This test is derived from a loop like:
-; int result = default_val;
-; for (int i = 0; i < N; ++i) {
-; if (i < 100) {
-; if (a[i] > threshold)
-; result = b[i];
-; }
-; }
-; return result;
-define i32 @csa_load_nested_ifs(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
-; NEON-LABEL: define i32 @csa_load_nested_ifs(
-; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; NEON-NEXT: [[OUTER_IF_COND:%.*]] = icmp ult i64 [[IV]], 100
-; NEON-NEXT: br i1 [[OUTER_IF_COND]], label %[[OUTER_IF_THEN:.*]], label %[[LATCH]]
-; NEON: [[OUTER_IF_THEN]]:
-; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; NEON: [[IF_THEN]]:
-; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ], [ [[DATA_PHI]], %[[OUTER_IF_THEN]] ]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-; SVE-LABEL: define i32 @csa_load_nested_ifs(
-; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
-; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; SVE: [[VECTOR_PH]]:
-; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[THRESHOLD]], i64 0
-; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SVE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DEFAULT_VAL]], i64 0
-; SVE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; SVE-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; SVE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP3]], i64 0
-; SVE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; SVE: [[VECTOR_BODY]]:
-; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP5:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP6:%.*]] = icmp ult <vscale x 4 x i64> [[VEC_IND]], splat (i64 100)
-; SVE-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
-; SVE-NEXT: [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; SVE-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> zeroinitializer
-; SVE-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
-; SVE-NEXT: [[TMP11:%.*]] = freeze <vscale x 4 x i1> [[TMP9]]
-; SVE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP11]])
-; SVE-NEXT: [[TMP13]] = select i1 [[TMP12]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP5]]
-; SVE-NEXT: [[TMP14]] = select i1 [[TMP12]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD5]], <vscale x 4 x i32> [[VEC_PHI]]
-; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; SVE-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
-; SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; SVE: [[MIDDLE_BLOCK]]:
-; SVE-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT2]], i32 0
-; SVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x i1> [[TMP13]], i32 [[TMP16]])
-; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; SVE: [[SCALAR_PH]]:
-; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ [[DEFAULT_VAL]], %[[ENTRY]] ]
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; SVE-NEXT: [[OUTER_IF_COND:%.*]] = icmp ult i64 [[IV]], 100
-; SVE-NEXT: br i1 [[OUTER_IF_COND]], label %[[OUTER_IF_THEN:.*]], label %[[LATCH]]
-; SVE: [[OUTER_IF_THEN]]:
-; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; SVE: [[IF_THEN]]:
-; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ], [ [[DATA_PHI]], %[[OUTER_IF_THEN]] ]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
-; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
- %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
- %outer.if.cond = icmp ult i64 %iv, 100
- br i1 %outer.if.cond, label %outer.if.then, label %latch
-
-outer.if.then:
- %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
- %ld.a = load i32, ptr %a.addr, align 4
- %if.cond = icmp sgt i32 %ld.a, %threshold
- br i1 %if.cond, label %if.then, label %latch
-
-if.then:
- %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
- %ld.b = load i32, ptr %b.addr, align 4
- br label %latch
-
-latch:
- %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ], [ %data.phi, %outer.if.then ]
- %iv.next = add nuw nsw i64 %iv, 1
- %exit.cmp = icmp eq i64 %iv.next, %N
- br i1 %exit.cmp, label %exit, label %loop
-
-exit:
- ret i32 %select.data
-}
-
-; This test is derived from an (unsupported) loop like:
-; int result = default_val;
-; for (int i = 0; i < N; ++i) {
-; if (a[i] > threshold)
-; result = b[i];
-; if (a[i] > 100)
-; result = c[i];
-; }
-; return result;
-define i32 @chained_csa_int_load(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %default_val, i64 %N, i32 %threshold) {
-; NEON-LABEL: define i32 @chained_csa_int_load(
-; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA_1:%.*]], %[[LATCH]] ]
-; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NEON-NEXT: [[IF_COND_0:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; NEON-NEXT: br i1 [[IF_COND_0]], label %[[IF_THEN_0:.*]], label %[[IF_END_0:.*]]
-; NEON: [[IF_THEN_0]]:
-; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; NEON-NEXT: br label %[[IF_END_0]]
-; NEON: [[IF_END_0]]:
-; NEON-NEXT: [[SELECT_DATA_0:%.*]] = phi i32 [ [[LD_B]], %[[IF_THEN_0]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; NEON-NEXT: [[IF_COND_1:%.*]] = icmp sgt i32 [[LD_A]], 100
-; NEON-NEXT: br i1 [[IF_COND_1]], label %[[IF_THEN_1:.*]], label %[[LATCH]]
-; NEON: [[IF_THEN_1]]:
-; NEON-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[C]], i64 [[IV]]
-; NEON-NEXT: [[LD_C:%.*]] = load i32, ptr [[C_ADDR]], align 4
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[SELECT_DATA_1]] = phi i32 [ [[LD_C]], %[[IF_THEN_1]] ], [ [[SELECT_DATA_0]], %[[IF_END_0]] ]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[SELECT_DATA_1_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA_1]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[SELECT_DATA_1_LCSSA]]
-;
-; SVE-LABEL: define i32 @chained_csa_int_load(
-; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA_1:%.*]], %[[LATCH]] ]
-; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; SVE-NEXT: [[IF_COND_0:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; SVE-NEXT: br i1 [[IF_COND_0]], label %[[IF_THEN_0:.*]], label %[[IF_END_0:.*]]
-; SVE: [[IF_THEN_0]]:
-; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; SVE-NEXT: br label %[[IF_END_0]]
-; SVE: [[IF_END_0]]:
-; SVE-NEXT: [[SELECT_DATA_0:%.*]] = phi i32 [ [[LD_B]], %[[IF_THEN_0]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; SVE-NEXT: [[IF_COND_1:%.*]] = icmp sgt i32 [[LD_A]], 100
-; SVE-NEXT: br i1 [[IF_COND_1]], label %[[IF_THEN_1:.*]], label %[[LATCH]]
-; SVE: [[IF_THEN_1]]:
-; SVE-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[C]], i64 [[IV]]
-; SVE-NEXT: [[LD_C:%.*]] = load i32, ptr [[C_ADDR]], align 4
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[SELECT_DATA_1]] = phi i32 [ [[LD_C]], %[[IF_THEN_1]] ], [ [[SELECT_DATA_0]], %[[IF_END_0]] ]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[SELECT_DATA_1_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA_1]], %[[LATCH]] ]
-; SVE-NEXT: ret i32 [[SELECT_DATA_1_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
- %data.phi = phi i32 [ %default_val, %entry ], [ %select.data.1, %latch ]
- %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
- %ld.a = load i32, ptr %a.addr, align 4
- %if.cond.0 = icmp sgt i32 %ld.a, %threshold
- br i1 %if.cond.0, label %if.then.0, label %if.end.0
-
-if.then.0:
- %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
- %ld.b = load i32, ptr %b.addr, align 4
- br label %if.end.0
-
-if.end.0:
- %select.data.0 = phi i32 [ %ld.b, %if.then.0 ], [ %data.phi, %loop ]
- %if.cond.1 = icmp sgt i32 %ld.a, 100
- br i1 %if.cond.1, label %if.then.1, label %latch
-
-if.then.1:
- %c.addr = getelementptr inbounds nuw i32, ptr %c, i64 %iv
- %ld.c = load i32, ptr %c.addr, align 4
- br label %latch
-
-latch:
- %select.data.1 = phi i32 [ %ld.c, %if.then.1 ], [ %select.data.0, %if.end.0 ]
- %iv.next = add nuw nsw i64 %iv, 1
- %exit.cmp = icmp eq i64 %iv.next, %N
- br i1 %exit.cmp, label %exit, label %loop
-
-exit:
- ret i32 %select.data.1
-}
-
-; This test is derived from an (unsupported) loop like:
-; int result = default_val;
-; for (int i = 0; i < N; ++i) {
-; if (a[i] > threshold)
-; result = b[i];
-; res[i] = result;
-; }
-; return result;
-define i32 @simple_csa_int_load_multi_user(ptr noalias %a, ptr noalias %b, ptr noalias %results, i32 %default_val, i64 %N, i32 %threshold) {
-; NEON-LABEL: define i32 @simple_csa_int_load_multi_user(
-; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[RESULTS:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; NEON: [[IF_THEN]]:
-; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]]
-; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-; SVE-LABEL: define i32 @simple_csa_int_load_multi_user(
-; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[RESULTS:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; SVE: [[IF_THEN]]:
-; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]]
-; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
-; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
- %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
- %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
- %ld.a = load i32, ptr %a.addr, align 4
- %if.cond = icmp sgt i32 %ld.a, %threshold
- br i1 %if.cond, label %if.then, label %latch
-
-if.then:
- %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
- %ld.b = load i32, ptr %b.addr, align 4
- br label %latch
-
-latch:
- %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ]
- %res.addr = getelementptr inbounds i32, ptr %results, i64 %iv
- store i32 %select.data, ptr %res.addr, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exit.cmp = icmp eq i64 %iv.next, %N
- br i1 %exit.cmp, label %exit, label %loop
-
-exit:
- ret i32 %select.data
-}
-
-; This tests the following unsupported case.
-; int result = default_val;
-; int foo = 0;
-; for (int i = 0; i < N; ++i) {
-; if (a[i] > threshold)
-; result = b[i];
-; foo += result; // Fails as CSA has an extra user.
-; }
-; return result ^ foo;
-define i32 @csa_load_used_in_add_reduction(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
-; NEON-LABEL: define i32 @csa_load_used_in_add_reduction(
-; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; NEON-NEXT: [[SUM_PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LATCH]] ]
-; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; NEON: [[IF_THEN]]:
-; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; NEON-NEXT: [[ADD]] = add nsw i32 [[SELECT_DATA]], [[SUM_PHI]]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-; SVE-LABEL: define i32 @csa_load_used_in_add_reduction(
-; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LATCH]] ]
-; SVE-NEXT: [[SUM_PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LATCH]] ]
-; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; SVE: [[IF_THEN]]:
-; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[SELECT_DATA]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; SVE-NEXT: [[ADD]] = add nsw i32 [[SELECT_DATA]], [[SUM_PHI]]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LATCH]] ]
-; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
- %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
- %sum.phi = phi i32 [ 0, %entry ], [ %add, %latch ]
- %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
- %ld.a = load i32, ptr %a.addr, align 4
- %if.cond = icmp sgt i32 %ld.a, %threshold
- br i1 %if.cond, label %if.then, label %latch
-
-if.then:
- %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
- %ld.b = load i32, ptr %b.addr, align 4
- br label %latch
-
-latch:
- %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ]
- %add = add nsw i32 %select.data, %sum.phi
- %iv.next = add nuw nsw i64 %iv, 1
- %exit.cmp = icmp eq i64 %iv.next, %N
- br i1 %exit.cmp, label %exit, label %loop
-
-exit:
- ret i32 %select.data
-}
-
-; This test case is from the following unsupported loop (where lastsum is using
-; another reduction value).
-; int lastsum = 0, sum = 0;
-; for (int i=0; i<N; ++i) {
-; if (mask[i])
-; lastsum = sum + b[i];
-; sum += src[i];
-; }
-; return lastsum;
-define i32 @csa_find_last_phi_use_other_reduction(ptr noalias %mask, ptr noalias %src, ptr noalias %b, i64 %N) {
-; NEON-LABEL: define i32 @csa_find_last_phi_use_other_reduction(
-; NEON-SAME: ptr noalias [[MASK:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[SUM:%.*]] = phi i32 [ [[ADD_SRC:%.*]], %[[LATCH]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[LASTSUM_0:%.*]] = phi i32 [ [[LASTSUM_1:%.*]], %[[LATCH]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[MASK_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[MASK]], i64 [[IV]]
-; NEON-NEXT: [[MASK_VAL:%.*]] = load i32, ptr [[MASK_PTR]], align 4
-; NEON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[MASK_VAL]], 0
-; NEON-NEXT: br i1 [[TOBOOL_NOT]], label %[[LATCH]], label %[[COND_ASSIGN:.*]]
-; NEON: [[COND_ASSIGN]]:
-; NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; NEON-NEXT: [[ZEXT_B:%.*]] = zext i8 [[B_VAL]] to i32
-; NEON-NEXT: [[ADD_B:%.*]] = add nuw nsw i32 [[SUM]], [[ZEXT_B]]
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[LASTSUM_1]] = phi i32 [ [[ADD_B]], %[[COND_ASSIGN]] ], [ [[LASTSUM_0]], %[[LOOP]] ]
-; NEON-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[IV]]
-; NEON-NEXT: [[SRC_VAL:%.*]] = load i8, ptr [[SRC_PTR]], align 1
-; NEON-NEXT: [[ZEXT_SRC:%.*]] = zext i8 [[SRC_VAL]] to i32
-; NEON-NEXT: [[ADD_SRC]] = add nuw nsw i32 [[SUM]], [[ZEXT_SRC]]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[LASTSUM_1_LCSSA:%.*]] = phi i32 [ [[LASTSUM_1]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[LASTSUM_1_LCSSA]]
-;
-; SVE-LABEL: define i32 @csa_find_last_phi_use_other_reduction(
-; SVE-SAME: ptr noalias [[MASK:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[SUM:%.*]] = phi i32 [ [[ADD_SRC:%.*]], %[[LATCH]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[LASTSUM_0:%.*]] = phi i32 [ [[LASTSUM_1:%.*]], %[[LATCH]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[MASK_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[MASK]], i64 [[IV]]
-; SVE-NEXT: [[MASK_VAL:%.*]] = load i32, ptr [[MASK_PTR]], align 4
-; SVE-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[MASK_VAL]], 0
-; SVE-NEXT: br i1 [[TOBOOL_NOT]], label %[[LATCH]], label %[[COND_ASSIGN:.*]]
-; SVE: [[COND_ASSIGN]]:
-; SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; SVE-NEXT: [[ZEXT_B:%.*]] = zext i8 [[B_VAL]] to i32
-; SVE-NEXT: [[ADD_B:%.*]] = add nuw nsw i32 [[SUM]], [[ZEXT_B]]
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[LASTSUM_1]] = phi i32 [ [[ADD_B]], %[[COND_ASSIGN]] ], [ [[LASTSUM_0]], %[[LOOP]] ]
-; SVE-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 [[IV]]
-; SVE-NEXT: [[SRC_VAL:%.*]] = load i8, ptr [[SRC_PTR]], align 1
-; SVE-NEXT: [[ZEXT_SRC:%.*]] = zext i8 [[SRC_VAL]] to i32
-; SVE-NEXT: [[ADD_SRC]] = add nuw nsw i32 [[SUM]], [[ZEXT_SRC]]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[LASTSUM_1_LCSSA:%.*]] = phi i32 [ [[LASTSUM_1]], %[[LATCH]] ]
-; SVE-NEXT: ret i32 [[LASTSUM_1_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ]
- %sum = phi i32 [ %add.src, %latch ], [ 0, %entry ]
- %lastsum.0 = phi i32 [ %lastsum.1, %latch ], [ 0, %entry ]
- %mask.ptr = getelementptr inbounds nuw i32, ptr %mask, i64 %iv
- %mask.val = load i32, ptr %mask.ptr, align 4
- %tobool.not = icmp eq i32 %mask.val, 0
- br i1 %tobool.not, label %latch, label %cond.assign
-
-cond.assign:
- %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
- %b.val = load i8, ptr %b.ptr, align 1
- %zext.b = zext i8 %b.val to i32
- %add.b = add nuw nsw i32 %sum, %zext.b
- br label %latch
-
-latch:
- %lastsum.1 = phi i32 [ %add.b, %cond.assign ], [ %lastsum.0, %loop ]
- %src.ptr = getelementptr inbounds nuw i8, ptr %src, i64 %iv
- %src.val = load i8, ptr %src.ptr, align 1
- %zext.src = zext i8 %src.val to i32
- %add.src = add nuw nsw i32 %sum, %zext.src
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, %N
- br i1 %exitcond.not, label %exit, label %loop
-
-exit:
- ret i32 %lastsum.1
-}
-
-; This test is based on the following loop:
-; int lastsum = 0;
-; for (int i=0; i<N; ++i) {
-; if (mask[i])
-; lastsum = i + b[i];
-; }
-; return lastsum;
-define i32 @csa_find_last_phi_use_iv(ptr noalias %mask, ptr noalias %src, ptr noalias %b, i64 %N) {
-; NEON-LABEL: define i32 @csa_find_last_phi_use_iv(
-; NEON-SAME: ptr noalias [[MASK:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[LASTSUM_0:%.*]] = phi i32 [ [[LASTSUM_1:%.*]], %[[LATCH]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[MASK_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[MASK]], i64 [[IV]]
-; NEON-NEXT: [[MASK_VAL:%.*]] = load i32, ptr [[MASK_PTR]], align 4
-; NEON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[MASK_VAL]], 0
-; NEON-NEXT: br i1 [[TOBOOL_NOT]], label %[[LATCH]], label %[[COND_ASSIGN:.*]]
-; NEON: [[COND_ASSIGN]]:
-; NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; NEON-NEXT: [[ZEXT_B:%.*]] = zext i8 [[B_VAL]] to i32
-; NEON-NEXT: [[TRUNC_IV:%.*]] = trunc nuw nsw i64 [[IV]] to i32
-; NEON-NEXT: [[ADD_IV:%.*]] = add nuw nsw i32 [[TRUNC_IV]], [[ZEXT_B]]
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[LASTSUM_1]] = phi i32 [ [[ADD_IV]], %[[COND_ASSIGN]] ], [ [[LASTSUM_0]], %[[LOOP]] ]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[LASTSUM_1_LCSSA:%.*]] = phi i32 [ [[LASTSUM_1]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[LASTSUM_1_LCSSA]]
-;
-; SVE-LABEL: define i32 @csa_find_last_phi_use_iv(
-; SVE-SAME: ptr noalias [[MASK:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
-; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; SVE: [[VECTOR_PH]]:
-; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
-; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SVE-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; SVE-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
-; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; SVE: [[VECTOR_BODY]]:
-; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP6:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[MASK]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i32>, ptr [[TMP7]], align 4
-; SVE-NEXT: [[TMP8:%.*]] = icmp ne <vscale x 16 x i32> [[WIDE_LOAD]], zeroinitializer
-; SVE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[TMP8]], <vscale x 16 x i8> poison)
-; SVE-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
-; SVE-NEXT: [[TMP11:%.*]] = add nuw nsw <vscale x 16 x i32> [[VEC_IND]], [[TMP10]]
-; SVE-NEXT: [[TMP12:%.*]] = freeze <vscale x 16 x i1> [[TMP8]]
-; SVE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP12]])
-; SVE-NEXT: [[TMP14]] = select i1 [[TMP13]], <vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP6]]
-; SVE-NEXT: [[TMP15]] = select i1 [[TMP13]], <vscale x 16 x i32> [[TMP11]], <vscale x 16 x i32> [[VEC_PHI]]
-; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; SVE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SVE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; SVE: [[MIDDLE_BLOCK]]:
-; SVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv16i32(<vscale x 16 x i32> [[TMP15]], <vscale x 16 x i1> [[TMP14]], i32 0)
-; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; SVE: [[SCALAR_PH]]:
-; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; SVE-NEXT: [[LASTSUM_0:%.*]] = phi i32 [ [[LASTSUM_1:%.*]], %[[LATCH]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
-; SVE-NEXT: [[MASK_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[MASK]], i64 [[IV]]
-; SVE-NEXT: [[MASK_VAL:%.*]] = load i32, ptr [[MASK_PTR]], align 4
-; SVE-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[MASK_VAL]], 0
-; SVE-NEXT: br i1 [[TOBOOL_NOT]], label %[[LATCH]], label %[[COND_ASSIGN:.*]]
-; SVE: [[COND_ASSIGN]]:
-; SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; SVE-NEXT: [[ZEXT_B:%.*]] = zext i8 [[B_VAL]] to i32
-; SVE-NEXT: [[TRUNC_IV:%.*]] = trunc nuw nsw i64 [[IV]] to i32
-; SVE-NEXT: [[ADD_IV:%.*]] = add nuw nsw i32 [[TRUNC_IV]], [[ZEXT_B]]
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[LASTSUM_1]] = phi i32 [ [[ADD_IV]], %[[COND_ASSIGN]] ], [ [[LASTSUM_0]], %[[LOOP]] ]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[LASTSUM_1_LCSSA:%.*]] = phi i32 [ [[LASTSUM_1]], %[[LATCH]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ]
-; SVE-NEXT: ret i32 [[LASTSUM_1_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ]
- %lastsum.0 = phi i32 [ %lastsum.1, %latch ], [ 0, %entry ]
- %mask.ptr = getelementptr inbounds nuw i32, ptr %mask, i64 %iv
- %mask.val = load i32, ptr %mask.ptr, align 4
- %tobool.not = icmp eq i32 %mask.val, 0
- br i1 %tobool.not, label %latch, label %cond.assign
-
-cond.assign:
- %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
- %b.val = load i8, ptr %b.ptr, align 1
- %zext.b = zext i8 %b.val to i32
- %trunc.iv = trunc nuw nsw i64 %iv to i32
- %add.iv = add nuw nsw i32 %trunc.iv, %zext.b
- br label %latch
-
-latch:
- %lastsum.1 = phi i32 [ %add.iv, %cond.assign ], [ %lastsum.0, %loop ]
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, %N
- br i1 %exitcond.not, label %exit, label %loop
-
-exit:
- ret i32 %lastsum.1
-}
-
-; This test is based on the following loop:
-; int last_a = 0;
-; int last_b = 0;
-; for (int i=0; i<N; ++i) {
-; if (mask[i]) {
-; last_a = a[i];
-; last_b = b[i];
-; }
-; }
-; return last_a ^ last_b;
-define i32 @csa_multiple_find_last_phi_reductions(ptr noalias %mask, ptr noalias %a, ptr noalias %b, i64 %N) {
-; NEON-LABEL: define i32 @csa_multiple_find_last_phi_reductions(
-; NEON-SAME: ptr noalias [[MASK:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[LAST_A_0:%.*]] = phi i32 [ [[LAST_A_1:%.*]], %[[LATCH]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[LAST_B_0:%.*]] = phi i32 [ [[LAST_B_1:%.*]], %[[LATCH]] ], [ 0, %[[ENTRY]] ]
-; NEON-NEXT: [[MASK_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[MASK]], i64 [[IV]]
-; NEON-NEXT: [[MASK_VAL:%.*]] = load i32, ptr [[MASK_PTR]], align 4
-; NEON-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[MASK_VAL]], 0
-; NEON-NEXT: br i1 [[TOBOOL_NOT]], label %[[LATCH]], label %[[COND_ASSIGN:.*]]
-; NEON: [[COND_ASSIGN]]:
-; NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[IV]]
-; NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; NEON-NEXT: [[A_ZEXT:%.*]] = zext i8 [[A_VAL]] to i32
-; NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; NEON-NEXT: [[B_ZEXT:%.*]] = zext i8 [[B_VAL]] to i32
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[LAST_A_1]] = phi i32 [ [[A_ZEXT]], %[[COND_ASSIGN]] ], [ [[LAST_A_0]], %[[LOOP]] ]
-; NEON-NEXT: [[LAST_B_1]] = phi i32 [ [[B_ZEXT]], %[[COND_ASSIGN]] ], [ [[LAST_B_0]], %[[LOOP]] ]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[LAST_A_1_LCSSA:%.*]] = phi i32 [ [[LAST_A_1]], %[[LATCH]] ]
-; NEON-NEXT: [[LAST_B_1_LCSSA:%.*]] = phi i32 [ [[LAST_B_1]], %[[LATCH]] ]
-; NEON-NEXT: [[XOR:%.*]] = xor i32 [[LAST_B_1_LCSSA]], [[LAST_A_1_LCSSA]]
-; NEON-NEXT: ret i32 [[XOR]]
-;
-; SVE-LABEL: define i32 @csa_multiple_find_last_phi_reductions(
-; SVE-SAME: ptr noalias [[MASK:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4
-; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; SVE: [[VECTOR_PH]]:
-; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
-; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; SVE: [[VECTOR_BODY]]:
-; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP4:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP5:%.*]] = phi <vscale x 16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
-; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[MASK]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
-; SVE-NEXT: [[TMP7:%.*]] = icmp ne <vscale x 16 x i32> [[WIDE_LOAD]], zeroinitializer
-; SVE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP8]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
-; SVE-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
-; SVE-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; SVE-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
-; SVE-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]] to <vscale x 16 x i32>
-; SVE-NEXT: [[TMP12:%.*]] = freeze <vscale x 16 x i1> [[TMP7]]
-; SVE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP12]])
-; SVE-NEXT: [[TMP14]] = select i1 [[TMP13]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP4]]
-; SVE-NEXT: [[TMP15]] = select i1 [[TMP13]], <vscale x 16 x i32> [[TMP9]], <vscale x 16 x i32> [[VEC_PHI]]
-; SVE-NEXT: [[TMP16]] = select i1 [[TMP13]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP5]]
-; SVE-NEXT: [[TMP17]] = select i1 [[TMP13]], <vscale x 16 x i32> [[TMP11]], <vscale x 16 x i32> [[VEC_PHI1]]
-; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; SVE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SVE-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; SVE: [[MIDDLE_BLOCK]]:
-; SVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv16i32(<vscale x 16 x i32> [[TMP15]], <vscale x 16 x i1> [[TMP14]], i32 0)
-; SVE-NEXT: [[TMP20:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv16i32(<vscale x 16 x i32> [[TMP17]], <vscale x 16 x i1> [[TMP16]], i32 0)
-; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; SVE: [[SCALAR_PH]]:
-; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: [[BC_MERGE_RDX3:%.*]] = phi i32 [ [[TMP20]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; SVE-NEXT: [[LAST_A_0:%.*]] = phi i32 [ [[LAST_A_1:%.*]], %[[LATCH]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
-; SVE-NEXT: [[LAST_B_0:%.*]] = phi i32 [ [[LAST_B_1:%.*]], %[[LATCH]] ], [ [[BC_MERGE_RDX3]], %[[SCALAR_PH]] ]
-; SVE-NEXT: [[MASK_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[MASK]], i64 [[IV]]
-; SVE-NEXT: [[MASK_VAL:%.*]] = load i32, ptr [[MASK_PTR]], align 4
-; SVE-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[MASK_VAL]], 0
-; SVE-NEXT: br i1 [[TOBOOL_NOT]], label %[[LATCH]], label %[[COND_ASSIGN:.*]]
-; SVE: [[COND_ASSIGN]]:
-; SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[IV]]
-; SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1
-; SVE-NEXT: [[A_ZEXT:%.*]] = zext i8 [[A_VAL]] to i32
-; SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1
-; SVE-NEXT: [[B_ZEXT:%.*]] = zext i8 [[B_VAL]] to i32
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[LAST_A_1]] = phi i32 [ [[A_ZEXT]], %[[COND_ASSIGN]] ], [ [[LAST_A_0]], %[[LOOP]] ]
-; SVE-NEXT: [[LAST_B_1]] = phi i32 [ [[B_ZEXT]], %[[COND_ASSIGN]] ], [ [[LAST_B_0]], %[[LOOP]] ]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[LAST_A_1_LCSSA:%.*]] = phi i32 [ [[LAST_A_1]], %[[LATCH]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
-; SVE-NEXT: [[LAST_B_1_LCSSA:%.*]] = phi i32 [ [[LAST_B_1]], %[[LATCH]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ]
-; SVE-NEXT: [[XOR:%.*]] = xor i32 [[LAST_B_1_LCSSA]], [[LAST_A_1_LCSSA]]
-; SVE-NEXT: ret i32 [[XOR]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ]
- %last_a.0 = phi i32 [ %last_a.1, %latch ], [ 0, %entry ]
- %last_b.0 = phi i32 [ %last_b.1, %latch ], [ 0, %entry ]
- %mask.ptr = getelementptr inbounds nuw i32, ptr %mask, i64 %iv
- %mask.val = load i32, ptr %mask.ptr, align 4
- %tobool.not = icmp eq i32 %mask.val, 0
- br i1 %tobool.not, label %latch, label %cond.assign
-
-cond.assign:
- %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %iv
- %a.val = load i8, ptr %a.ptr, align 1
- %a.zext = zext i8 %a.val to i32
- %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %iv
- %b.val = load i8, ptr %b.ptr, align 1
- %b.zext = zext i8 %b.val to i32
- br label %latch
-
-latch:
- %last_a.1 = phi i32 [ %a.zext, %cond.assign ], [ %last_a.0, %loop ]
- %last_b.1 = phi i32 [ %b.zext, %cond.assign ], [ %last_b.0, %loop ]
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, %N
- br i1 %exitcond.not, label %exit, label %loop
-
-exit:
- %xor = xor i32 %last_b.1, %last_a.1
- ret i32 %xor
-}
-
-; Negative test for a find-last-like phi fed into a find-last select.
-define i32 @find_last_phi_fed_into_find_last_select(
-; NEON-LABEL: define i32 @find_last_phi_fed_into_find_last_select(
-; NEON-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) {
-; NEON-NEXT: [[ENTRY:.*]]:
-; NEON-NEXT: br label %[[LOOP:.*]]
-; NEON: [[LOOP]]:
-; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA_1:%.*]], %[[LATCH]] ]
-; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; NEON-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; NEON-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; NEON: [[IF_THEN]]:
-; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; NEON-NEXT: br label %[[LATCH]]
-; NEON: [[LATCH]]:
-; NEON-NEXT: [[SELECT_DATA:%.*]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; NEON-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; NEON-NEXT: [[LD_C:%.*]] = load i32, ptr [[C_ADDR]], align 4
-; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[LD_A]], [[LD_C]]
-; NEON-NEXT: [[SELECT_DATA_1]] = select i1 [[SELECT_CMP]], i32 [[SELECT_DATA]], i32 [[DATA_PHI]]
-; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; NEON: [[EXIT]]:
-; NEON-NEXT: [[SELECT_DATA_1_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA_1]], %[[LATCH]] ]
-; NEON-NEXT: ret i32 [[SELECT_DATA_1_LCSSA]]
-;
-; SVE-LABEL: define i32 @find_last_phi_fed_into_find_last_select(
-; SVE-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i32 [[DEFAULT_VAL:%.*]], i64 [[N:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] {
-; SVE-NEXT: [[ENTRY:.*]]:
-; SVE-NEXT: br label %[[LOOP:.*]]
-; SVE: [[LOOP]]:
-; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[DEFAULT_VAL]], %[[ENTRY]] ], [ [[SELECT_DATA_1:%.*]], %[[LATCH]] ]
-; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
-; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; SVE-NEXT: [[IF_COND:%.*]] = icmp sgt i32 [[LD_A]], [[THRESHOLD]]
-; SVE-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[LATCH]]
-; SVE: [[IF_THEN]]:
-; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
-; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4
-; SVE-NEXT: br label %[[LATCH]]
-; SVE: [[LATCH]]:
-; SVE-NEXT: [[SELECT_DATA:%.*]] = phi i32 [ [[LD_B]], %[[IF_THEN]] ], [ [[DATA_PHI]], %[[LOOP]] ]
-; SVE-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; SVE-NEXT: [[LD_C:%.*]] = load i32, ptr [[C_ADDR]], align 4
-; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[LD_A]], [[LD_C]]
-; SVE-NEXT: [[SELECT_DATA_1]] = select i1 [[SELECT_CMP]], i32 [[SELECT_DATA]], i32 [[DATA_PHI]]
-; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]]
-; SVE: [[EXIT]]:
-; SVE-NEXT: [[SELECT_DATA_1_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA_1]], %[[LATCH]] ]
-; SVE-NEXT: ret i32 [[SELECT_DATA_1_LCSSA]]
-;
- ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %default_val, i64 %N, i32 %threshold) {
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
- %data.phi = phi i32 [ %default_val, %entry ], [ %select.data.1, %latch ]
- %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
- %ld.a = load i32, ptr %a.addr, align 4
- %if.cond = icmp sgt i32 %ld.a, %threshold
- br i1 %if.cond, label %if.then, label %latch
-
-if.then:
- %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
- %ld.b = load i32, ptr %b.addr, align 4
- br label %latch
-
-latch:
- %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ]
- %c.addr = getelementptr inbounds i32, ptr %c, i64 %iv
- %ld.c = load i32, ptr %c.addr, align 4
- %select.cmp = icmp slt i32 %ld.a, %ld.c
- %select.data.1 = select i1 %select.cmp, i32 %select.data, i32 %data.phi
- %iv.next = add nuw nsw i64 %iv, 1
- %exit.cmp = icmp eq i64 %iv.next, %N
- br i1 %exit.cmp, label %exit, label %loop
-
-exit:
- ret i32 %select.data.1
-}
-
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.interleave.count", i32 2}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index bca0ed8ba7344..06fa33a457bea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -119,6 +119,15 @@
; YAML-NEXT: ...
; YAML-NEXT: --- !Analysis
; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: NonReductionValueUsedOutsideLoop
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function: test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'loop not vectorized: '
+; YAML-NEXT: - String: value that could not be identified as reduction is used outside the loop
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass: loop-vectorize
; YAML-NEXT: Name: CantVectorizeLibcall
; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 29, Column: 11 }
; YAML-NEXT: Function: test_multiple_failures
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
index cff3740f97f9f..2487914639fe9 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll
@@ -76,103 +76,3 @@ loop:
exit:
ret i32 %select.data
}
-
-; This function is derived from the following C program:
-; int simple_csa_int_load(int* a, int* b, int default_val, int N, int threshold) {
-; int result = default_val;
-; for (int i = 0; i < N; ++i)
-; if (a[i] > threshold)
-; result = b[i];
-; return result;
-; }
-define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val, i64 %N, i32 %threshold) {
-; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
-; CHECK-NEXT: Live-in vp<[[VF:%.*]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.*]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VECTC:%.*]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<[[ORIGTC:%.*]]> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CIV:%.*]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEXNEXT]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[DATAPHI:%.*]]> = phi ir<%default_val>, vp<[[DATASELECT:%.*]]>
-; CHECK-NEXT: WIDEN-PHI vp<[[MASKPHI:%.*]]> = phi [ ir<false>, vector.ph ], [ vp<[[MASKSELECT:%.*]]>, if.then.0 ]
-; CHECK-NEXT: vp<[[STEPS_A:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: CLONE ir<[[A_ADDR:%.*]]> = getelementptr inbounds nuw ir<%a>, vp<[[STEPS_A]]>
-; CHECK-NEXT: vp<[[A_VPTR:%.*]]> = vector-pointer inbounds nuw ir<[[A_ADDR]]>
-; CHECK-NEXT: WIDEN ir<[[LD_A:%.*]]> = load vp<[[A_VPTR]]>
-; CHECK-NEXT: WIDEN ir<[[IF_COND:%.*]]> = icmp sgt ir<[[LD_A]]>, ir<%threshold>
-; CHECK-NEXT: Successor(s): pred.load
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.load: {
-; CHECK-NEXT: pred.load.entry:
-; CHECK-NEXT: BRANCH-ON-MASK ir<[[IF_COND]]>
-; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.if:
-; CHECK-NEXT: vp<[[STEPS_B:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: REPLICATE ir<[[B_ADDR:%.*]]> = getelementptr inbounds nuw ir<%b>, vp<[[STEPS_B]]>
-; CHECK-NEXT: REPLICATE ir<[[LD_B:%.*]]> = load ir<[[B_ADDR]]> (S->V)
-; CHECK-NEXT: Successor(s): pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.continue:
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[B_VEC:%.*]]> = ir<[[LD_B]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): if.then.0
-; CHECK-EMPTY:
-; CHECK-NEXT: if.then.0:
-; CHECK-NEXT: EMIT vp<[[ANYOF:%.*]]> = any-of ir<[[IF_COND]]>
-; CHECK-NEXT: EMIT vp<[[MASKSELECT]]> = select vp<[[ANYOF]]>, ir<[[IF_COND]]>, vp<[[MASKPHI]]>
-; CHECK-NEXT: EMIT vp<[[DATASELECT]]> = select vp<[[ANYOF]]>, vp<[[B_VEC]]>, ir<[[DATAPHI]]>
-; CHECK-NEXT: EMIT vp<[[INDEXNEXT:%.*]]> = add nuw vp<[[CIV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT: EMIT branch-on-count vp<[[INDEXNEXT]]>, vp<[[VECTC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[EXTRACTLAST:%.*]]> = extract-last-active vp<[[DATASELECT]]>, vp<[[MASKSELECT]]>, ir<%default_val>
-; CHECK-NEXT: EMIT vp<[[TCCMP:%.*]]> = icmp eq ir<[[ORIGTC]]>, vp<[[VECTC]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[TCCMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %latch ] (extra operand: vp<[[EXTRACTLAST]]> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<%default_val>, ir-bb<entry> ]
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
- %data.phi = phi i32 [ %default_val, %entry ], [ %select.data, %latch ]
- %a.addr = getelementptr inbounds nuw i32, ptr %a, i64 %iv
- %ld.a = load i32, ptr %a.addr, align 4
- %if.cond = icmp sgt i32 %ld.a, %threshold
- br i1 %if.cond, label %if.then, label %latch
-
-if.then:
- %b.addr = getelementptr inbounds nuw i32, ptr %b, i64 %iv
- %ld.b = load i32, ptr %b.addr, align 4
- br label %latch
-
-latch:
- %select.data = phi i32 [ %ld.b, %if.then ], [ %data.phi, %loop ]
- %iv.next = add nuw nsw i64 %iv, 1
- %exit.cmp = icmp eq i64 %iv.next, %N
- br i1 %exit.cmp, label %exit, label %loop
-
-exit:
- ret i32 %select.data
-}
diff --git a/llvm/unittests/Analysis/IVDescriptorsTest.cpp b/llvm/unittests/Analysis/IVDescriptorsTest.cpp
index 400e90d2e52bf..453800abf9cab 100644
--- a/llvm/unittests/Analysis/IVDescriptorsTest.cpp
+++ b/llvm/unittests/Analysis/IVDescriptorsTest.cpp
@@ -259,82 +259,3 @@ for.end:
EXPECT_EQ(Kind, RecurKind::FMax);
});
}
-
-TEST(IVDescriptorsTest, UnsupportedFindLastPhi) {
- // Parse the module.
- LLVMContext Context;
-
- // This is a constructed example for the test, but it's based on a loop like:
- // int result = default_val;
- // for (int i = 0; i < N; ++i) {
- // if (cond_0) {
- // if (cond_1)
- // result = v1;
- // } else {
- // if (cond_1)
- // result = v2;
- // }
- // }
- // return result;
- std::unique_ptr<Module> M = parseIR(Context, R"(
- define i64 @unsupported_find_last_phi(
- i64 %v1, i64 %v2, i64 %default_val, i64 %N, i1 %cond.0, i1 %cond.1
- ) {
- entry:
- br label %loop
-
- loop:
- %result.0 = phi i64 [ %default_val, %entry ], [ %result.3, %for.inc ]
- %idx = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
- br i1 %cond.0, label %outer.if.then, label %if.else
-
- outer.if.then:
- br i1 %cond.1, label %inner.if.then.0, label %inner.if.end.0
-
- inner.if.then.0:
- br label %inner.if.end.0
-
- inner.if.end.0:
- %result.1 = phi i64
- [ %v1, %inner.if.then.0 ], [ %result.0, %outer.if.then ]
- br label %for.inc
-
- if.else:
- br i1 %cond.1, label %inner.if.then.1, label %inner.if.end.1
-
- inner.if.then.1:
- br label %inner.if.end.1
-
- inner.if.end.1:
- %result.2 = phi i64 [ %v2, %inner.if.then.1 ], [ %result.0, %if.else ]
- br label %for.inc
-
- for.inc:
- %result.3 = phi i64
- [ %result.1, %inner.if.end.0 ], [ %result.2, %inner.if.end.1 ]
- %inc = add nsw i64 %idx, 1
- %cmp = icmp slt i64 %idx, %N
- br i1 %cmp, label %loop, label %exit
-
- exit:
- ret i64 %result.3
- })");
-
- runWithLoopInfoAndSE(*M, "unsupported_find_last_phi",
- [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) {
- Function::iterator FI = F.begin();
- // First basic block is entry - skip it.
- BasicBlock *Header = &*(++FI);
- assert(Header->getName() == "loop");
- Loop *L = LI.getLoopFor(Header);
- EXPECT_NE(L, nullptr);
- BasicBlock::iterator BBI = Header->begin();
- PHINode *Phi = dyn_cast<PHINode>(&*BBI);
- EXPECT_NE(Phi, nullptr);
- EXPECT_EQ(Phi->getName(), "result.0");
- RecurrenceDescriptor Rdx;
- bool IsRdxPhi =
- RecurrenceDescriptor::isReductionPHI(Phi, L, Rdx);
- EXPECT_FALSE(IsRdxPhi);
- });
-}
>From 82e01a9237cd0c820f99e8716d13ba839defffe7 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi at microsoft.com>
Date: Fri, 6 Feb 2026 16:25:20 -0500
Subject: [PATCH 05/13] [HLSL] Represent Matrix as arrays of vectors in memory
(#179861)
fixes https://github.com/llvm/llvm-project/issues/179859
For matrix types we need to check the language mode so we can change the
matrix memory layout to arrays of vectors. To make this play nice with
how the rest of clang treats matrices we need to modify the
MaybeConvertMatrixAddress and the CreateMemTemp function to know how to
reconstruct a flattened vector.
Rest of changes is just test updates.
---
clang/lib/CodeGen/CGExpr.cpp | 20 +++++++++---
clang/lib/CodeGen/CodeGenTypes.cpp | 9 ++++--
.../BasicFeatures/MatrixElementTypeCast.hlsl | 28 ++++++++--------
.../MatrixExplicitTruncation.hlsl | 32 +++++++++----------
.../MatrixImplicitTruncation.hlsl | 30 ++++++++---------
.../MatrixSingleSubscriptConstSwizzle.hlsl | 4 +--
.../MatrixSingleSubscriptDynamicSwizzle.hlsl | 6 ++--
.../MatrixSingleSubscriptGetter.hlsl | 20 ++++++------
.../MatrixSingleSubscriptSetter.hlsl | 2 +-
.../BasicFeatures/MatrixSplat.hlsl | 20 ++++++------
clang/test/CodeGenHLSL/BoolMatrix.hlsl | 20 ++++++------
clang/test/CodeGenHLSL/basic_types.hlsl | 32 +++++++++----------
...member-one-based-accessor-scalar-load.hlsl | 32 +++++++++----------
.../matrix-member-one-based-swizzle-load.hlsl | 16 +++++-----
...ember-zero-based-accessor-scalar-load.hlsl | 32 +++++++++----------
...matrix-member-zero-based-swizzle-load.hlsl | 16 +++++-----
16 files changed, 168 insertions(+), 151 deletions(-)
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 42961f119370e..e9d7c4df32a2c 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -200,8 +200,14 @@ RawAddress CodeGenFunction::CreateMemTemp(QualType Ty, CharUnits Align,
if (Ty->isConstantMatrixType()) {
auto *ArrayTy = cast<llvm::ArrayType>(Result.getElementType());
- auto *VectorTy = llvm::FixedVectorType::get(ArrayTy->getElementType(),
- ArrayTy->getNumElements());
+ auto *ArrayElementTy = ArrayTy->getElementType();
+ auto ArrayElements = ArrayTy->getNumElements();
+ if (getContext().getLangOpts().HLSL) {
+ auto *VectorTy = cast<llvm::FixedVectorType>(ArrayElementTy);
+ ArrayElementTy = VectorTy->getElementType();
+ ArrayElements *= VectorTy->getNumElements();
+ }
+ auto *VectorTy = llvm::FixedVectorType::get(ArrayElementTy, ArrayElements);
Result = Address(Result.getPointer(), VectorTy, Result.getAlignment(),
KnownNonNull);
@@ -2279,8 +2285,14 @@ static RawAddress MaybeConvertMatrixAddress(RawAddress Addr,
bool IsVector = true) {
auto *ArrayTy = dyn_cast<llvm::ArrayType>(Addr.getElementType());
if (ArrayTy && IsVector) {
- auto *VectorTy = llvm::FixedVectorType::get(ArrayTy->getElementType(),
- ArrayTy->getNumElements());
+ auto ArrayElements = ArrayTy->getNumElements();
+ auto *ArrayElementTy = ArrayTy->getElementType();
+ if (CGF.getContext().getLangOpts().HLSL) {
+ auto *VectorTy = cast<llvm::FixedVectorType>(ArrayElementTy);
+ ArrayElementTy = VectorTy->getElementType();
+ ArrayElements *= VectorTy->getNumElements();
+ }
+ auto *VectorTy = llvm::FixedVectorType::get(ArrayElementTy, ArrayElements);
return Addr.withElementType(VectorTy);
}
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 0e1131d586433..fd7a8929a9be9 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -105,8 +105,13 @@ llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) {
const Type *Ty = Context.getCanonicalType(T).getTypePtr();
const ConstantMatrixType *MT = cast<ConstantMatrixType>(Ty);
llvm::Type *IRElemTy = ConvertType(MT->getElementType());
- if (Context.getLangOpts().HLSL && T->isConstantMatrixBoolType())
- IRElemTy = ConvertTypeForMem(Context.BoolTy);
+ if (Context.getLangOpts().HLSL) {
+ if (T->isConstantMatrixBoolType())
+ IRElemTy = ConvertTypeForMem(Context.BoolTy);
+ llvm::Type *VecTy =
+ llvm::FixedVectorType::get(IRElemTy, MT->getNumColumns());
+ return llvm::ArrayType::get(VecTy, MT->getNumRows());
+ }
return llvm::ArrayType::get(IRElemTy, MT->getNumElementsFlattened());
}
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
index 3bd7636212862..f48edc19b86f7 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
@@ -5,8 +5,8 @@
// CHECK-LABEL: define hidden noundef <6 x i32> @_Z22elementwise_type_cast0u11matrix_typeILm3ELm2EfE(
// CHECK-SAME: <6 x float> noundef nofpclass(nan inf) [[F32:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca [6 x float], align 4
-// CHECK-NEXT: [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[F32_ADDR:%.*]] = alloca [3 x <2 x float>], align 4
+// CHECK-NEXT: [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
// CHECK-NEXT: store <6 x float> [[F32]], ptr [[F32_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x float>, ptr [[F32_ADDR]], align 4
// CHECK-NEXT: [[CONV:%.*]] = fptosi <6 x float> [[TMP0]] to <6 x i32>
@@ -22,8 +22,8 @@ int3x2 elementwise_type_cast0(float3x2 f32) {
// CHECK-LABEL: define hidden noundef <6 x i32> @_Z22elementwise_type_cast1u11matrix_typeILm3ELm2EsE(
// CHECK-SAME: <6 x i16> noundef [[I16_32:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I16_32_ADDR:%.*]] = alloca [6 x i16], align 2
-// CHECK-NEXT: [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[I16_32_ADDR:%.*]] = alloca [3 x <2 x i16>], align 2
+// CHECK-NEXT: [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
// CHECK-NEXT: store <6 x i16> [[I16_32]], ptr [[I16_32_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i16>, ptr [[I16_32_ADDR]], align 2
// CHECK-NEXT: [[CONV:%.*]] = sext <6 x i16> [[TMP0]] to <6 x i32>
@@ -39,8 +39,8 @@ int3x2 elementwise_type_cast1(int16_t3x2 i16_32) {
// CHECK-LABEL: define hidden noundef <6 x i32> @_Z22elementwise_type_cast2u11matrix_typeILm3ELm2ElE(
// CHECK-SAME: <6 x i64> noundef [[I64_32:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I64_32_ADDR:%.*]] = alloca [6 x i64], align 8
-// CHECK-NEXT: [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[I64_32_ADDR:%.*]] = alloca [3 x <2 x i64>], align 8
+// CHECK-NEXT: [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
// CHECK-NEXT: store <6 x i64> [[I64_32]], ptr [[I64_32_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i64>, ptr [[I64_32_ADDR]], align 8
// CHECK-NEXT: [[CONV:%.*]] = trunc <6 x i64> [[TMP0]] to <6 x i32>
@@ -56,8 +56,8 @@ int3x2 elementwise_type_cast2(int64_t3x2 i64_32) {
// CHECK-LABEL: define hidden noundef <6 x i16> @_Z22elementwise_type_cast3u11matrix_typeILm2ELm3EDhE(
// CHECK-SAME: <6 x half> noundef nofpclass(nan inf) [[H23:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[H23_ADDR:%.*]] = alloca [6 x half], align 2
-// CHECK-NEXT: [[I23:%.*]] = alloca [6 x i16], align 2
+// CHECK-NEXT: [[H23_ADDR:%.*]] = alloca [2 x <3 x half>], align 2
+// CHECK-NEXT: [[I23:%.*]] = alloca [2 x <3 x i16>], align 2
// CHECK-NEXT: store <6 x half> [[H23]], ptr [[H23_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x half>, ptr [[H23_ADDR]], align 2
// CHECK-NEXT: [[CONV:%.*]] = fptosi <6 x half> [[TMP0]] to <6 x i16>
@@ -73,8 +73,8 @@ int16_t2x3 elementwise_type_cast3(half2x3 h23) {
// CHECK-LABEL: define hidden noundef <6 x i32> @_Z22elementwise_type_cast4u11matrix_typeILm3ELm2EdE(
// CHECK-SAME: <6 x double> noundef nofpclass(nan inf) [[D32:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[D32_ADDR:%.*]] = alloca [6 x double], align 8
-// CHECK-NEXT: [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[D32_ADDR:%.*]] = alloca [3 x <2 x double>], align 8
+// CHECK-NEXT: [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
// CHECK-NEXT: store <6 x double> [[D32]], ptr [[D32_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x double>, ptr [[D32_ADDR]], align 8
// CHECK-NEXT: [[CONV:%.*]] = fptosi <6 x double> [[TMP0]] to <6 x i32>
@@ -91,7 +91,7 @@ int3x2 elementwise_type_cast4(double3x2 d32) {
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A:%.*]] = alloca [2 x [1 x i32]], align 4
-// CHECK-NEXT: [[B:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[B:%.*]] = alloca [2 x <1 x i32>], align 4
// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [2 x [1 x i32]], align 4
// CHECK-NEXT: [[FLATCAST_TMP:%.*]] = alloca <2 x i32>, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @__const._Z5call2v.A, i32 8, i1 false)
@@ -120,7 +120,7 @@ struct S {
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 1
-// CHECK-NEXT: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[A:%.*]] = alloca [2 x <1 x i32>], align 4
// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [[STRUCT_S]], align 1
// CHECK-NEXT: [[FLATCAST_TMP:%.*]] = alloca <2 x i32>, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[S]], ptr align 1 @__const._Z5call3v.s, i32 8, i1 false)
@@ -155,7 +155,7 @@ struct Derived : BFields {
// CHECK-LABEL: define hidden void @_Z5call47Derived(
// CHECK-SAME: ptr noundef byval([[STRUCT_DERIVED:%.*]]) align 1 [[D:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT: [[A:%.*]] = alloca [2 x <2 x i32>], align 4
// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [[STRUCT_DERIVED]], align 1
// CHECK-NEXT: [[FLATCAST_TMP:%.*]] = alloca <4 x i32>, align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_TEMP]], ptr align 1 [[D]], i32 19, i1 false)
@@ -189,7 +189,7 @@ void call4(Derived D) {
// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: [[M:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [2 x <2 x float>], align 4
// CHECK-NEXT: [[HLSL_EWCAST_SRC:%.*]] = alloca <4 x float>, align 16
// CHECK-NEXT: [[FLATCAST_TMP:%.*]] = alloca <4 x float>, align 4
// CHECK-NEXT: store <4 x float> [[V]], ptr [[V_ADDR]], align 16
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl
index f3c4bc496d5a4..56f816806d63f 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixExplicitTruncation.hlsl
@@ -4,8 +4,8 @@
// CHECK-LABEL: define hidden noundef <12 x i32> @_Z10trunc_castu11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I34:%.*]] = alloca [12 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I34:%.*]] = alloca [3 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -21,8 +21,8 @@
// CHECK-LABEL: define hidden noundef <12 x i32> @_Z11trunc_cast0u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I43:%.*]] = alloca [12 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I43:%.*]] = alloca [4 x <3 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 12, i32 13, i32 14>
@@ -38,8 +38,8 @@
// CHECK-LABEL: define hidden noundef <9 x i32> @_Z11trunc_cast1u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I33:%.*]] = alloca [9 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I33:%.*]] = alloca [3 x <3 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10>
@@ -55,8 +55,8 @@
// CHECK-LABEL: define hidden noundef <6 x i32> @_Z11trunc_cast2u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9>
@@ -72,8 +72,8 @@
// CHECK-LABEL: define hidden noundef <6 x i32> @_Z11trunc_cast3u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I23:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I23:%.*]] = alloca [2 x <3 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6>
@@ -89,8 +89,8 @@
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z11trunc_cast4u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I22:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I22:%.*]] = alloca [2 x <2 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -106,8 +106,8 @@
// CHECK-LABEL: define hidden noundef <2 x i32> @_Z11trunc_cast5u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I21:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I21:%.*]] = alloca [2 x <1 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <2 x i32> <i32 0, i32 4>
@@ -123,7 +123,7 @@
// CHECK-LABEL: define hidden noundef i32 @_Z11trunc_cast6u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: [[I1:%.*]] = alloca i32, align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
@@ -140,7 +140,7 @@
// CHECK-LABEL: define hidden noundef i32 @_Z16trunc_multi_castu11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: [[I1:%.*]] = alloca i32, align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
index e621f68623bd1..b58f567eb51d3 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixImplicitTruncation.hlsl
@@ -4,8 +4,8 @@
// CHECK-LABEL: define hidden noundef <12 x i32> @_Z10trunc_castu11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I34:%.*]] = alloca [12 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I34:%.*]] = alloca [3 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -21,8 +21,8 @@
// CHECK-LABEL: define hidden noundef <12 x i32> @_Z11trunc_cast0u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I43:%.*]] = alloca [12 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I43:%.*]] = alloca [4 x <3 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 12, i32 13, i32 14>
@@ -38,8 +38,8 @@
// CHECK-LABEL: define hidden noundef <9 x i32> @_Z11trunc_cast1u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I33:%.*]] = alloca [9 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I33:%.*]] = alloca [3 x <3 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10>
@@ -55,8 +55,8 @@
// CHECK-LABEL: define hidden noundef <6 x i32> @_Z11trunc_cast2u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I32:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9>
@@ -72,8 +72,8 @@
// CHECK-LABEL: define hidden noundef <6 x i32> @_Z11trunc_cast3u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I23:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I23:%.*]] = alloca [2 x <3 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6>
@@ -89,8 +89,8 @@
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z11trunc_cast4u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I22:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I22:%.*]] = alloca [2 x <2 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -106,8 +106,8 @@
// CHECK-LABEL: define hidden noundef <2 x i32> @_Z11trunc_cast5u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
-// CHECK-NEXT: [[I21:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
+// CHECK-NEXT: [[I21:%.*]] = alloca [2 x <1 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <2 x i32> <i32 0, i32 4>
@@ -123,7 +123,7 @@
// CHECK-LABEL: define hidden noundef i32 @_Z11trunc_cast6u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[I44:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[I44_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: [[I1:%.*]] = alloca i32, align 4
// CHECK-NEXT: store <16 x i32> [[I44]], ptr [[I44_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[I44_ADDR]], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptConstSwizzle.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptConstSwizzle.hlsl
index 02885d153697a..2b950d8a51a38 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptConstSwizzle.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptConstSwizzle.hlsl
@@ -108,7 +108,7 @@ void setVectorOnMatrixSwizzle(out int2x3 M, int3 V) {
// CHECK-SAME: ptr noalias noundef nonnull align 4 dereferenceable(24) [[M:%.*]], <6 x i32> noundef [[N:%.*]], i32 noundef [[MINDEX:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-NEXT: [[N_ADDR:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca [2 x <3 x i32>], align 4
// CHECK-NEXT: [[MINDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store ptr [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store <6 x i32> [[N]], ptr [[N_ADDR]], align 4
@@ -139,7 +139,7 @@ void setMatrixFromMatrix(out int2x3 M, int2x3 N, int MIndex) {
// CHECK-SAME: ptr noalias noundef nonnull align 4 dereferenceable(24) [[M:%.*]], <6 x i32> noundef [[N:%.*]], i32 noundef [[NINDEX:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-NEXT: [[N_ADDR:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca [2 x <3 x i32>], align 4
// CHECK-NEXT: [[NINDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store ptr [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store <6 x i32> [[N]], ptr [[N_ADDR]], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptDynamicSwizzle.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptDynamicSwizzle.hlsl
index 97ce63f545cff..7190b6e1148a5 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptDynamicSwizzle.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptDynamicSwizzle.hlsl
@@ -38,7 +38,7 @@ void setMatrix(out float4x4 M, int index, float4 V) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z9getMatrixu11matrix_typeILm4ELm4EfEi(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[M:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store <16 x float> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR]], align 4
@@ -62,7 +62,7 @@ float3 getMatrix(float4x4 M, int index) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z9getMatrixu11matrix_typeILm3ELm3EfEi(
// CHECK-SAME: <9 x float> noundef nofpclass(nan inf) [[M:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [9 x float], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [3 x <3 x float>], align 4
// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store <9 x float> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR]], align 4
@@ -115,7 +115,7 @@ int3 getMatrixSwizzle2x3(out int2x3 M, int index) {
// CHECK-SAME: ptr noalias noundef nonnull align 4 dereferenceable(24) [[M:%.*]], <6 x i32> noundef [[N:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-NEXT: [[N_ADDR:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca [2 x <3 x i32>], align 4
// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store ptr [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store <6 x i32> [[N]], ptr [[N_ADDR]], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptGetter.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptGetter.hlsl
index df724d217fe6b..efa9381b515af 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptGetter.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptGetter.hlsl
@@ -4,7 +4,7 @@
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z24getFloatVecMatrixDynamicu11matrix_typeILm4ELm4EfEi(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[M:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store <16 x float> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR]], align 4
@@ -31,7 +31,7 @@ float4 getFloatVecMatrixDynamic(float4x4 M, int index) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z27getFloatScalarMatrixDynamicu11matrix_typeILm2ELm1EfEi(
// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[M:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [2 x float], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [2 x <1 x float>], align 4
// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store <2 x float> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR]], align 4
@@ -50,7 +50,7 @@ float getFloatScalarMatrixDynamic(float2x1 M, int index) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z28getFloatScalarMatrixConstantu11matrix_typeILm2ELm1EfE(
// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [2 x float], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [2 x <1 x float>], align 4
// CHECK-NEXT: store <2 x float> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[MATRIX_ELEM:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
@@ -65,7 +65,7 @@ float getFloatScalarMatrixConstant(float2x1 M) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z29getFloatScalarMatrixConstant2u11matrix_typeILm2ELm1EfE(
// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [2 x float], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [2 x <1 x float>], align 4
// CHECK-NEXT: store <2 x float> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[MATRIX_ELEM:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
@@ -80,7 +80,7 @@ float getFloatScalarMatrixConstant2(float2x1 M) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z19getIntMatrixDynamicu11matrix_typeILm4ELm4EiEi(
// CHECK-SAME: <16 x i32> noundef [[M:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: store <16 x i32> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR]], align 4
@@ -107,7 +107,7 @@ int4 getIntMatrixDynamic(int4x4 M, int index) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z22AddFloatMatrixConstantu11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[M:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[MATRIX_ELEM:%.*]] = extractelement <16 x float> [[TMP0]], i32 0
@@ -157,7 +157,7 @@ float4 AddFloatMatrixConstant(float4x4 M) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z20AddIntMatrixConstantu11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[M:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[MATRIX_ELEM:%.*]] = extractelement <16 x i32> [[TMP0]], i32 0
@@ -207,7 +207,7 @@ int4 AddIntMatrixConstant(int4x4 M) {
// CHECK-LABEL: define hidden noundef <3 x i1> @_Z23getBoolVecMatrixDynamicu11matrix_typeILm2ELm3EbEi(
// CHECK-SAME: <6 x i1> noundef [[M:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [2 x <3 x i32>], align 4
// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[TMP0:%.*]] = zext <6 x i1> [[M]] to <6 x i32>
// CHECK-NEXT: store <6 x i32> [[TMP0]], ptr [[M_ADDR]], align 4
@@ -233,7 +233,7 @@ bool3 getBoolVecMatrixDynamic(bool2x3 M, int index) {
// CHECK-LABEL: define hidden noundef <4 x i1> @_Z24getBoolVecMatrixConstantu11matrix_typeILm4ELm4EbE(
// CHECK-SAME: <16 x i1> noundef [[M:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: [[TMP0:%.*]] = zext <16 x i1> [[M]] to <16 x i32>
// CHECK-NEXT: store <16 x i32> [[TMP0]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr [[M_ADDR]], align 4
@@ -255,7 +255,7 @@ bool4 getBoolVecMatrixConstant(bool4x4 M) {
// CHECK-LABEL: define hidden noundef i1 @_Z27getBoolScalarMatrixConstantu11matrix_typeILm3ELm1EbE(
// CHECK-SAME: <3 x i1> noundef [[M:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [3 x i32], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [3 x <1 x i32>], align 4
// CHECK-NEXT: [[TMP0:%.*]] = zext <3 x i1> [[M]] to <3 x i32>
// CHECK-NEXT: store <3 x i32> [[TMP0]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[M_ADDR]], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptSetter.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptSetter.hlsl
index 15861b3211606..ec362aa269986 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptSetter.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSingleSubscriptSetter.hlsl
@@ -127,7 +127,7 @@ void setBoolMatrixScalar(out bool2x1 M, int index, bool S) {
// CHECK-SAME: ptr noalias noundef nonnull align 4 dereferenceable(64) [[M:%.*]], <16 x i32> noundef [[N:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[M_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-NEXT: [[N_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store ptr [[M]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: store <16 x i32> [[N]], ptr [[N_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[N_ADDR]], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSplat.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSplat.hlsl
index 9b9538e0afdd1..768c1b8e02bea 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixSplat.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixSplat.hlsl
@@ -4,7 +4,7 @@
// CHECK-LABEL: define hidden void @_Z13ConstantSplatv(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> splat (i32 1), ptr [[M]], align 4
// CHECK-NEXT: ret void
//
@@ -15,7 +15,7 @@ void ConstantSplat() {
// CHECK-LABEL: define hidden void @_Z18ConstantFloatSplatv(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [2 x <2 x float>], align 4
// CHECK-NEXT: store <4 x float> splat (float 3.250000e+00), ptr [[M]], align 4
// CHECK-NEXT: ret void
//
@@ -26,7 +26,7 @@ void ConstantFloatSplat() {
// CHECK-LABEL: define hidden void @_Z21ConstantTrueBoolSplatv(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M:%.*]] = alloca [9 x i32], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [3 x <3 x i32>], align 4
// CHECK-NEXT: store <9 x i32> splat (i32 1), ptr [[M]], align 4
// CHECK-NEXT: ret void
//
@@ -37,7 +37,7 @@ void ConstantTrueBoolSplat() {
// CHECK-LABEL: define hidden void @_Z22ConstantFalseBoolSplatv(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M:%.*]] = alloca [9 x i32], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [3 x <3 x i32>], align 4
// CHECK-NEXT: store <9 x i32> zeroinitializer, ptr [[M]], align 4
// CHECK-NEXT: ret void
//
@@ -49,7 +49,7 @@ void ConstantFalseBoolSplat() {
// CHECK-SAME: float noundef nofpclass(nan inf) [[VALUE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VALUE_ADDR:%.*]] = alloca float, align 4
-// CHECK-NEXT: [[M:%.*]] = alloca [9 x float], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [3 x <3 x float>], align 4
// CHECK-NEXT: store float [[VALUE]], ptr [[VALUE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[VALUE_ADDR]], align 4
// CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <9 x float> poison, float [[TMP0]], i64 0
@@ -65,7 +65,7 @@ void DynamicSplat(float Value) {
// CHECK-SAME: i1 noundef [[VALUE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VALUE_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[M:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[VALUE]] to i32
// CHECK-NEXT: store i32 [[STOREDV]], ptr [[VALUE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE_ADDR]], align 4
@@ -84,7 +84,7 @@ void DynamicBoolSplat(bool Value) {
// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[VALUE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VALUE_ADDR:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: [[M:%.*]] = alloca [9 x float], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [3 x <3 x float>], align 4
// CHECK-NEXT: store <4 x float> [[VALUE]], ptr [[VALUE_ADDR]], align 16
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[VALUE_ADDR]], align 16
// CHECK-NEXT: [[CAST_VTRUNC:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
@@ -101,7 +101,7 @@ void CastThenSplat(float4 Value) {
// CHECK-SAME: <3 x i32> noundef [[VALUE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VALUE_ADDR:%.*]] = alloca <3 x i32>, align 16
-// CHECK-NEXT: [[M:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [2 x <2 x i32>], align 4
// CHECK-NEXT: store <3 x i32> [[VALUE]], ptr [[VALUE_ADDR]], align 16
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[VALUE_ADDR]], align 16
// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne <3 x i32> [[TMP0]], zeroinitializer
@@ -120,7 +120,7 @@ void ExplicitIntToBoolCastThenSplat(int3 Value) {
// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[VALUE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VALUE_ADDR:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: [[M:%.*]] = alloca [6 x i32], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [2 x <3 x i32>], align 4
// CHECK-NEXT: store <2 x float> [[VALUE]], ptr [[VALUE_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[VALUE_ADDR]], align 8
// CHECK-NEXT: [[TOBOOL:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <2 x float> [[TMP0]], zeroinitializer
@@ -139,7 +139,7 @@ void ExplicitFloatToBoolCastThenSplat(float2 Value) {
// CHECK-SAME: i1 noundef [[VALUE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VALUE_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[M:%.*]] = alloca [6 x float], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [3 x <2 x float>], align 4
// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[VALUE]] to i32
// CHECK-NEXT: store i32 [[STOREDV]], ptr [[VALUE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE_ADDR]], align 4
diff --git a/clang/test/CodeGenHLSL/BoolMatrix.hlsl b/clang/test/CodeGenHLSL/BoolMatrix.hlsl
index 824b9656e6848..d6ac50c978405 100644
--- a/clang/test/CodeGenHLSL/BoolMatrix.hlsl
+++ b/clang/test/CodeGenHLSL/BoolMatrix.hlsl
@@ -11,7 +11,7 @@ struct S {
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 4
-// CHECK-NEXT: [[B:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT: [[B:%.*]] = alloca [2 x <2 x i32>], align 4
// CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[B]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[B]], align 4
// CHECK-NEXT: [[MATRIXEXT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
@@ -29,7 +29,7 @@ bool fn1() {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca <4 x i1>, align 4
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[A:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT: [[A:%.*]] = alloca [2 x <2 x i32>], align 4
// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[V]] to i32
// CHECK-NEXT: store i32 [[STOREDV]], ptr [[V_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
@@ -77,11 +77,11 @@ bool fn3() {
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 4
-// CHECK-NEXT: [[ARR:%.*]] = alloca [2 x [4 x i32]], align 4
+// CHECK-NEXT: [[ARR:%.*]] = alloca [2 x [2 x <2 x i32>]], align 4
// CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[ARR]], align 4
-// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [4 x i32], ptr [[ARR]], i32 1
+// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[ARR]], i32 1
// CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[ARRAYINIT_ELEMENT]], align 4
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr [[ARR]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [2 x <2 x i32>]], ptr [[ARR]], i32 0, i32 0
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
// CHECK-NEXT: [[MATRIXEXT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
// CHECK-NEXT: store i32 [[MATRIXEXT]], ptr [[RETVAL]], align 4
@@ -96,7 +96,7 @@ bool fn4() {
// CHECK-LABEL: define hidden void @_Z3fn5v(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[M:%.*]] = alloca [4 x i32], align 4
+// CHECK-NEXT: [[M:%.*]] = alloca [2 x <2 x i32>], align 4
// CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[M]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <4 x i32>, ptr [[M]], i32 0, i32 3
// CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4
@@ -134,11 +134,11 @@ void fn6() {
// CHECK-LABEL: define hidden void @_Z3fn7v(
// CHECK-SAME: ) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[ARR:%.*]] = alloca [2 x [4 x i32]], align 4
+// CHECK-NEXT: [[ARR:%.*]] = alloca [2 x [2 x <2 x i32>]], align 4
// CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[ARR]], align 4
-// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [4 x i32], ptr [[ARR]], i32 1
+// CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[ARR]], i32 1
// CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[ARRAYINIT_ELEMENT]], align 4
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [4 x i32]], ptr [[ARR]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x [2 x <2 x i32>]], ptr [[ARR]], i32 0, i32 0
// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <4 x i32>, ptr [[ARRAYIDX]], i32 0, i32 1
// CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4
// CHECK-NEXT: ret void
@@ -152,7 +152,7 @@ void fn7() {
// CHECK-SAME: <16 x i1> noundef [[M:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca <16 x i1>, align 4
-// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[M_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: [[TMP0:%.*]] = zext <16 x i1> [[M]] to <16 x i32>
// CHECK-NEXT: store <16 x i32> [[TMP0]], ptr [[M_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr [[M_ADDR]], align 4
diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl
index 677a9a8f5d1de..0aaf7a1b77797 100644
--- a/clang/test/CodeGenHLSL/basic_types.hlsl
+++ b/clang/test/CodeGenHLSL/basic_types.hlsl
@@ -38,22 +38,22 @@
// CHECK: @double2_Val = external hidden addrspace(2) global <2 x double>, align 16
// CHECK: @double3_Val = external hidden addrspace(2) global <3 x double>, align 32
// CHECK: @double4_Val = external hidden addrspace(2) global <4 x double>, align 32
-// CHECK: @bool1x1_Val = external hidden addrspace(2) global [1 x i32], align 4
-// CHECK: @bool1x2_Val = external hidden addrspace(2) global [2 x i32], align 4
-// CHECK: @bool1x3_Val = external hidden addrspace(2) global [3 x i32], align 4
-// CHECK: @bool1x4_Val = external hidden addrspace(2) global [4 x i32], align 4
-// CHECK: @bool2x1_Val = external hidden addrspace(2) global [2 x i32], align 4
-// CHECK: @bool2x2_Val = external hidden addrspace(2) global [4 x i32], align 4
-// CHECK: @bool2x3_Val = external hidden addrspace(2) global [6 x i32], align 4
-// CHECK: @bool2x4_Val = external hidden addrspace(2) global [8 x i32], align 4
-// CHECK: @bool3x1_Val = external hidden addrspace(2) global [3 x i32], align 4
-// CHECK: @bool3x2_Val = external hidden addrspace(2) global [6 x i32], align 4
-// CHECK: @bool3x3_Val = external hidden addrspace(2) global [9 x i32], align 4
-// CHECK: @bool3x4_Val = external hidden addrspace(2) global [12 x i32], align 4
-// CHECK: @bool4x1_Val = external hidden addrspace(2) global [4 x i32], align 4
-// CHECK: @bool4x2_Val = external hidden addrspace(2) global [8 x i32], align 4
-// CHECK: @bool4x3_Val = external hidden addrspace(2) global [12 x i32], align 4
-// CHECK: @bool4x4_Val = external hidden addrspace(2) global [16 x i32], align 4
+// CHECK: @bool1x1_Val = external hidden addrspace(2) global [1 x <1 x i32>], align 4
+// CHECK: @bool1x2_Val = external hidden addrspace(2) global [1 x <2 x i32>], align 4
+// CHECK: @bool1x3_Val = external hidden addrspace(2) global [1 x <3 x i32>], align 4
+// CHECK: @bool1x4_Val = external hidden addrspace(2) global [1 x <4 x i32>], align 4
+// CHECK: @bool2x1_Val = external hidden addrspace(2) global [2 x <1 x i32>], align 4
+// CHECK: @bool2x2_Val = external hidden addrspace(2) global [2 x <2 x i32>], align 4
+// CHECK: @bool2x3_Val = external hidden addrspace(2) global [2 x <3 x i32>], align 4
+// CHECK: @bool2x4_Val = external hidden addrspace(2) global [2 x <4 x i32>], align 4
+// CHECK: @bool3x1_Val = external hidden addrspace(2) global [3 x <1 x i32>], align 4
+// CHECK: @bool3x2_Val = external hidden addrspace(2) global [3 x <2 x i32>], align 4
+// CHECK: @bool3x3_Val = external hidden addrspace(2) global [3 x <3 x i32>], align 4
+// CHECK: @bool3x4_Val = external hidden addrspace(2) global [3 x <4 x i32>], align 4
+// CHECK: @bool4x1_Val = external hidden addrspace(2) global [4 x <1 x i32>], align 4
+// CHECK: @bool4x2_Val = external hidden addrspace(2) global [4 x <2 x i32>], align 4
+// CHECK: @bool4x3_Val = external hidden addrspace(2) global [4 x <3 x i32>], align 4
+// CHECK: @bool4x4_Val = external hidden addrspace(2) global [4 x <4 x i32>], align 4
#ifdef NAMESPACED
#define TYPE_DECL(T) hlsl::T T##_Val
diff --git a/clang/test/CodeGenHLSL/matrix-member-one-based-accessor-scalar-load.hlsl b/clang/test/CodeGenHLSL/matrix-member-one-based-accessor-scalar-load.hlsl
index 1d3d4d17e0c8a..bedb9fdbe11c8 100644
--- a/clang/test/CodeGenHLSL/matrix-member-one-based-accessor-scalar-load.hlsl
+++ b/clang/test/CodeGenHLSL/matrix-member-one-based-accessor-scalar-load.hlsl
@@ -7,7 +7,7 @@
// CHECK-LABEL: define hidden noundef i32 @_Z8Return11u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> zeroinitializer
@@ -21,7 +21,7 @@ int Return11(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return12u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 1>
@@ -35,7 +35,7 @@ int Return12(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return13u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 2>
@@ -49,7 +49,7 @@ int Return13(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return14u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 3>
@@ -63,7 +63,7 @@ int Return14(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return21u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 4>
@@ -77,7 +77,7 @@ int Return21(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return22u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 5>
@@ -91,7 +91,7 @@ int Return22(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return23u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 6>
@@ -105,7 +105,7 @@ int Return23(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return24u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 7>
@@ -119,7 +119,7 @@ int Return24(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return31u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 8>
@@ -133,7 +133,7 @@ int Return31(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return32u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 9>
@@ -147,7 +147,7 @@ int Return32(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return33u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 10>
@@ -161,7 +161,7 @@ int Return33(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return34u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 11>
@@ -175,7 +175,7 @@ int Return34(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return41u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 12>
@@ -189,7 +189,7 @@ int Return41(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return42u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 13>
@@ -203,7 +203,7 @@ int Return42(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return43u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 14>
@@ -217,7 +217,7 @@ int Return43(int4x4 A) {
// CHECK-LABEL: define hidden noundef i32 @_Z8Return44u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <1 x i32> <i32 15>
diff --git a/clang/test/CodeGenHLSL/matrix-member-one-based-swizzle-load.hlsl b/clang/test/CodeGenHLSL/matrix-member-one-based-swizzle-load.hlsl
index 31a56811473af..47737aaab0390 100644
--- a/clang/test/CodeGenHLSL/matrix-member-one-based-swizzle-load.hlsl
+++ b/clang/test/CodeGenHLSL/matrix-member-one-based-swizzle-load.hlsl
@@ -6,7 +6,7 @@
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z17ReturnOnesSwizzleu11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -19,7 +19,7 @@ int4 ReturnOnesSwizzle(int4x4 A) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z18ReturnOnesSwizzle2u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -32,7 +32,7 @@ int4 ReturnOnesSwizzle2(int4x4 A) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z17ReturnTwosSwizzleu11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -45,7 +45,7 @@ int4 ReturnTwosSwizzle(int4x4 A) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z18ReturnTwosSwizzle2u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -58,7 +58,7 @@ int4 ReturnTwosSwizzle2(int4x4 A) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z19ReturnThreesSwizzleu11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
@@ -71,7 +71,7 @@ int4 ReturnThreesSwizzle(int4x4 A) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z20ReturnThreesSwizzle2u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
@@ -84,7 +84,7 @@ int4 ReturnThreesSwizzle2(int4x4 A) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z18ReturnFoursSwizzleu11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
@@ -97,7 +97,7 @@ int4 ReturnFoursSwizzle(int4x4 A) {
// CHECK-LABEL: define hidden noundef <4 x i32> @_Z19ReturnFoursSwizzle2u11matrix_typeILm4ELm4EiE(
// CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x i32], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x i32>], align 4
// CHECK-NEXT: store <16 x i32> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
diff --git a/clang/test/CodeGenHLSL/matrix-member-zero-based-accessor-scalar-load.hlsl b/clang/test/CodeGenHLSL/matrix-member-zero-based-accessor-scalar-load.hlsl
index f99bbb2bf2b02..8626e2d0d68b5 100644
--- a/clang/test/CodeGenHLSL/matrix-member-zero-based-accessor-scalar-load.hlsl
+++ b/clang/test/CodeGenHLSL/matrix-member-zero-based-accessor-scalar-load.hlsl
@@ -7,7 +7,7 @@
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return00u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> zeroinitializer
@@ -21,7 +21,7 @@ float Return00(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return01u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 1>
@@ -35,7 +35,7 @@ float Return01(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return02u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 2>
@@ -49,7 +49,7 @@ float Return02(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return03u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 3>
@@ -63,7 +63,7 @@ float Return03(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return10u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 4>
@@ -77,7 +77,7 @@ float Return10(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return11u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 5>
@@ -91,7 +91,7 @@ float Return11(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return12u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 6>
@@ -105,7 +105,7 @@ float Return12(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return13u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 7>
@@ -119,7 +119,7 @@ float Return13(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return20u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 8>
@@ -133,7 +133,7 @@ float Return20(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return21u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 9>
@@ -147,7 +147,7 @@ float Return21(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return22u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 10>
@@ -161,7 +161,7 @@ float Return22(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return23u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 11>
@@ -175,7 +175,7 @@ float Return23(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return30u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 12>
@@ -189,7 +189,7 @@ float Return30(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return31u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 13>
@@ -203,7 +203,7 @@ float Return31(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return32u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 14>
@@ -217,7 +217,7 @@ float Return32(float4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z8Return33u11matrix_typeILm4ELm4EfE(
// CHECK-SAME: <16 x float> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x float], align 4
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4
// CHECK-NEXT: store <16 x float> [[A]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <1 x i32> <i32 15>
diff --git a/clang/test/CodeGenHLSL/matrix-member-zero-based-swizzle-load.hlsl b/clang/test/CodeGenHLSL/matrix-member-zero-based-swizzle-load.hlsl
index 6a58094f59bc2..dca2e6132de60 100644
--- a/clang/test/CodeGenHLSL/matrix-member-zero-based-swizzle-load.hlsl
+++ b/clang/test/CodeGenHLSL/matrix-member-zero-based-swizzle-load.hlsl
@@ -6,7 +6,7 @@
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18ReturnZerosSwizzleu11matrix_typeILm4ELm4EDhE(
// CHECK-SAME: <16 x half> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x half], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x half>], align 2
// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -19,7 +19,7 @@ half4 ReturnZerosSwizzle(half4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z19ReturnZerosSwizzle2u11matrix_typeILm4ELm4EDhE(
// CHECK-SAME: <16 x half> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x half], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x half>], align 2
// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -32,7 +32,7 @@ half4 ReturnZerosSwizzle2(half4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z17ReturnOnesSwizzleu11matrix_typeILm4ELm4EDhE(
// CHECK-SAME: <16 x half> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x half], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x half>], align 2
// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -45,7 +45,7 @@ half4 ReturnOnesSwizzle(half4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18ReturnOnesSwizzle2u11matrix_typeILm4ELm4EDhE(
// CHECK-SAME: <16 x half> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x half], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x half>], align 2
// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -58,7 +58,7 @@ half4 ReturnOnesSwizzle2(half4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z17ReturnTwosSwizzleu11matrix_typeILm4ELm4EDhE(
// CHECK-SAME: <16 x half> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x half], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x half>], align 2
// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
@@ -71,7 +71,7 @@ half4 ReturnTwosSwizzle(half4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18ReturnTwosSwizzle2u11matrix_typeILm4ELm4EDhE(
// CHECK-SAME: <16 x half> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x half], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x half>], align 2
// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
@@ -84,7 +84,7 @@ half4 ReturnTwosSwizzle2(half4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z19ReturnThreesSwizzleu11matrix_typeILm4ELm4EDhE(
// CHECK-SAME: <16 x half> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x half], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x half>], align 2
// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
@@ -97,7 +97,7 @@ half4 ReturnThreesSwizzle(half4x4 A) {
// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z20ReturnThreesSwizzle2u11matrix_typeILm4ELm4EDhE(
// CHECK-SAME: <16 x half> noundef nofpclass(nan inf) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [16 x half], align 2
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [4 x <4 x half>], align 2
// CHECK-NEXT: store <16 x half> [[A]], ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
>From e405e03687ba1149a87fce1fe4392347d13097af Mon Sep 17 00:00:00 2001
From: Argyrios Kyrtzidis <kyrtzidis at apple.com>
Date: Fri, 6 Feb 2026 13:27:24 -0800
Subject: [PATCH 06/13] [llvm/CAS] Add file-based APIs to `OnDiskGraphDB`
(#179782)
These allow performing optimizations that reduce I/O and disk space
consumption. For example, when applicable, a file can be cloned directly
into the database directory, instead of needing to load it in memory and
then copy its contents into a new file.
These APIs are then used to optimize importing data from an upstream DB
by using file cloning where applicable.
---
llvm/include/llvm/CAS/BuiltinObjectHasher.h | 2 +
llvm/include/llvm/CAS/OnDiskGraphDB.h | 47 +++-
llvm/lib/CAS/BuiltinObjectHasher.cpp | 51 ++++
llvm/lib/CAS/CMakeLists.txt | 1 +
llvm/lib/CAS/OnDiskGraphDB.cpp | 252 +++++++++++++++---
.../unittests/CAS/BuiltinObjectHasherTest.cpp | 49 ++++
llvm/unittests/CAS/CMakeLists.txt | 1 +
llvm/unittests/CAS/OnDiskCommonUtils.h | 15 ++
llvm/unittests/CAS/OnDiskGraphDBTest.cpp | 208 +++++++++++++++
9 files changed, 595 insertions(+), 31 deletions(-)
create mode 100644 llvm/lib/CAS/BuiltinObjectHasher.cpp
create mode 100644 llvm/unittests/CAS/BuiltinObjectHasherTest.cpp
diff --git a/llvm/include/llvm/CAS/BuiltinObjectHasher.h b/llvm/include/llvm/CAS/BuiltinObjectHasher.h
index c9b004216f796..7079e5ec448c8 100644
--- a/llvm/include/llvm/CAS/BuiltinObjectHasher.h
+++ b/llvm/include/llvm/CAS/BuiltinObjectHasher.h
@@ -39,6 +39,8 @@ template <class HasherT> class BuiltinObjectHasher {
return H.finish();
}
+ static Expected<HashT> hashFile(StringRef FilePath);
+
private:
HashT finish() { return Hasher.final(); }
diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h
index 64ad83440bc9c..0c57893f465f9 100644
--- a/llvm/include/llvm/CAS/OnDiskGraphDB.h
+++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h
@@ -264,6 +264,19 @@ class OnDiskGraphDB {
LLVM_ABI_FOR_TEST Error store(ObjectID ID, ArrayRef<ObjectID> Refs,
ArrayRef<char> Data);
+ /// Associates the data of a file with a particular object ID. If there is
+ /// already a record for this object the operation is a no-op.
+ ///
+ /// This is more than a convenience variant of \c store(): \c storeFile() can
+ /// perform optimizations that reduce I/O and disk space consumption.
+ ///
+ /// If there are any concurrent modifications to the file, the contents in the
+ /// CAS may be corrupt.
+ ///
+ /// \param ID the object ID to associate the data with.
+ /// \param FilePath the path of the file data.
+ LLVM_ABI_FOR_TEST Error storeFile(ObjectID ID, StringRef FilePath);
+
/// \returns \p nullopt if the object associated with \p Ref does not exist.
LLVM_ABI_FOR_TEST Expected<std::optional<ObjectHandle>> load(ObjectID Ref);
@@ -315,6 +328,31 @@ class OnDiskGraphDB {
return make_range(Refs.begin(), Refs.end());
}
+ /// Encapsulates file info for an underlying object node.
+ struct FileBackedData {
+ /// The data of the object node.
+ ArrayRef<char> Data;
+
+ struct FileInfoTy {
+ /// The file path of the object node.
+ std::string FilePath;
+ /// Whether the file of the object leaf node has an extra nul appended at
+ /// the end. If the file is copied the extra nul needs to be removed.
+ bool IsFileNulTerminated;
+ };
+ /// File information for the object, if available.
+ std::optional<FileInfoTy> FileInfo;
+ };
+
+ /// Provides access to the underlying file path, that represents an object
+ /// leaf node, when available.
+ ///
+ /// This enables reducing I/O and disk space consumption, i.e. instead of
+ /// loading the data in memory and then writing it to a file, the client could
+ /// clone the underlying file directly. The client *must not* write to or
+ /// delete the underlying file, the path is provided only for reading/copying.
+ FileBackedData getInternalFileBackedObjectData(ObjectHandle Node) const;
+
/// \returns Total size of stored objects.
///
/// NOTE: There's a possibility that the returned size is not including a
@@ -398,12 +436,19 @@ class OnDiskGraphDB {
Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
/// Import only the \param UpstreamNode.
Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
+ Error importUpstreamData(ObjectID PrimaryID, ArrayRef<ObjectID> PrimaryRefs,
+ ObjectHandle UpstreamNode);
+
+ enum class InternalUpstreamImportKind { Leaf, Leaf0 };
+ /// Private \c storeFile that optimizes internal upstream database imports.
+ Error storeFile(ObjectID ID, StringRef FilePath,
+ std::optional<InternalUpstreamImportKind> ImportKind);
/// Found the IndexProxy for the hash.
Expected<IndexProxy> indexHash(ArrayRef<uint8_t> Hash);
/// Get path for creating standalone data file.
- void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
+ void getStandalonePath(StringRef FileSuffix, FileOffset IndexOffset,
SmallVectorImpl<char> &Path) const;
/// Create a standalone leaf file.
Error createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data);
diff --git a/llvm/lib/CAS/BuiltinObjectHasher.cpp b/llvm/lib/CAS/BuiltinObjectHasher.cpp
new file mode 100644
index 0000000000000..756954b0f0808
--- /dev/null
+++ b/llvm/lib/CAS/BuiltinObjectHasher.cpp
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/Support/BLAKE3.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+template <class HasherT>
+Expected<typename BuiltinObjectHasher<HasherT>::HashT>
+BuiltinObjectHasher<HasherT>::hashFile(StringRef FilePath) {
+ BuiltinObjectHasher H;
+ H.updateSize(0); // 0 refs
+
+ sys::fs::file_t FD;
+ if (Error E = sys::fs::openNativeFileForRead(FilePath).moveInto(FD))
+ return E;
+
+ sys::fs::file_status Status;
+ std::error_code EC = sys::fs::status(FD, Status);
+ if (EC)
+ return createFileError(FilePath, EC);
+ // FIXME: Do we need to add a hash of the data size? If we remove that we can
+ // avoid needing to read the file size before reading the file contents.
+ H.updateSize(Status.getSize());
+
+ size_t ChunkSize = sys::fs::DefaultReadChunkSize;
+ SmallVector<char, 0> Buffer;
+ Buffer.resize_for_overwrite(ChunkSize);
+ for (;;) {
+ Expected<size_t> ReadBytes =
+ sys::fs::readNativeFile(FD, MutableArrayRef(Buffer.begin(), ChunkSize));
+ if (!ReadBytes)
+ return ReadBytes.takeError();
+ if (*ReadBytes == 0)
+ break;
+ H.Hasher.update(toStringRef(ArrayRef(Buffer).take_front(*ReadBytes)));
+ }
+
+ return H.finish();
+}
+
+// Provide the definition for when using the BLAKE3 hasher.
+template Expected<BuiltinObjectHasher<BLAKE3>::HashT>
+BuiltinObjectHasher<BLAKE3>::hashFile(StringRef FilePath);
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index 605c548ba994f..b17fa84558bab 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMCAS
ActionCache.cpp
ActionCaches.cpp
BuiltinCAS.cpp
+ BuiltinObjectHasher.cpp
BuiltinUnifiedCASDatabases.cpp
CASNodeSchema.cpp
DatabaseFile.cpp
diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp
index 2aaec22b10701..d2e1966f4540f 100644
--- a/llvm/lib/CAS/OnDiskGraphDB.cpp
+++ b/llvm/lib/CAS/OnDiskGraphDB.cpp
@@ -65,6 +65,7 @@
#include <atomic>
#include <mutex>
#include <optional>
+#include <variant>
#define DEBUG_TYPE "on-disk-cas"
@@ -353,6 +354,13 @@ struct DataRecordHandle {
struct OnDiskContent {
std::optional<DataRecordHandle> Record;
std::optional<ArrayRef<char>> Bytes;
+
+ ArrayRef<char> getData() const {
+ if (Bytes)
+ return *Bytes;
+ assert(Record && "Expected record or bytes");
+ return Record->getData();
+ }
};
/// Data loaded inside the memory from standalone file.
@@ -360,9 +368,12 @@ class StandaloneDataInMemory {
public:
OnDiskContent getContent() const;
+ OnDiskGraphDB::FileBackedData
+ getInternalFileBackedObjectData(StringRef RootPath) const;
+
StandaloneDataInMemory(std::unique_ptr<sys::fs::mapped_file_region> Region,
- TrieRecord::StorageKind SK)
- : Region(std::move(Region)), SK(SK) {
+ TrieRecord::StorageKind SK, FileOffset IndexOffset)
+ : Region(std::move(Region)), SK(SK), IndexOffset(IndexOffset) {
#ifndef NDEBUG
bool IsStandalone = false;
switch (SK) {
@@ -381,6 +392,7 @@ class StandaloneDataInMemory {
private:
std::unique_ptr<sys::fs::mapped_file_region> Region;
TrieRecord::StorageKind SK;
+ FileOffset IndexOffset;
};
/// Container to lookup loaded standalone objects.
@@ -389,7 +401,8 @@ template <size_t NumShards> class StandaloneDataMap {
public:
uintptr_t insert(ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
- std::unique_ptr<sys::fs::mapped_file_region> Region);
+ std::unique_ptr<sys::fs::mapped_file_region> Region,
+ FileOffset IndexOffset);
const StandaloneDataInMemory *lookup(ArrayRef<uint8_t> Hash) const;
bool count(ArrayRef<uint8_t> Hash) const { return bool(lookup(Hash)); }
@@ -476,12 +489,14 @@ struct OnDiskGraphDB::IndexProxy {
template <size_t N>
uintptr_t StandaloneDataMap<N>::insert(
ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
- std::unique_ptr<sys::fs::mapped_file_region> Region) {
+ std::unique_ptr<sys::fs::mapped_file_region> Region,
+ FileOffset IndexOffset) {
auto &S = getShard(Hash);
std::lock_guard<std::mutex> Lock(S.Mutex);
auto &V = S.Map[Hash.data()];
if (!V)
- V = std::make_unique<StandaloneDataInMemory>(std::move(Region), SK);
+ V = std::make_unique<StandaloneDataInMemory>(std::move(Region), SK,
+ IndexOffset);
return reinterpret_cast<uintptr_t>(V.get());
}
@@ -949,7 +964,8 @@ Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const {
case TrieRecord::StorageKind::StandaloneLeaf:
case TrieRecord::StorageKind::StandaloneLeaf0:
SmallString<256> Path;
- getStandalonePath(TrieRecord::getStandaloneFilePrefix(D.SK), *I, Path);
+ getStandalonePath(TrieRecord::getStandaloneFilePrefix(D.SK), I->Offset,
+ Path);
// If need to validate the content of the file later, just load the
// buffer here. Otherwise, just check the existance of the file.
if (Deep) {
@@ -1201,28 +1217,37 @@ ArrayRef<uint8_t> OnDiskGraphDB::getDigest(const IndexProxy &I) const {
return I.Hash;
}
-static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool,
- ObjectHandle OH) {
+static std::variant<const StandaloneDataInMemory *, DataRecordHandle>
+getStandaloneDataOrDataRecord(const OnDiskDataAllocator &DataPool,
+ ObjectHandle OH) {
// Decode ObjectHandle to locate the stored content.
uint64_t Data = OH.getOpaqueData();
if (Data & 1) {
const auto *SDIM =
reinterpret_cast<const StandaloneDataInMemory *>(Data & (-1ULL << 1));
- return SDIM->getContent();
+ return SDIM;
}
auto DataHandle =
cantFail(DataRecordHandle::getFromDataPool(DataPool, FileOffset(Data)));
assert(DataHandle.getData().end()[0] == 0 && "Null termination");
- return OnDiskContent{DataHandle, std::nullopt};
+ return DataHandle;
+}
+
+static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool,
+ ObjectHandle OH) {
+ auto SDIMOrRecord = getStandaloneDataOrDataRecord(DataPool, OH);
+ if (std::holds_alternative<const StandaloneDataInMemory *>(SDIMOrRecord)) {
+ return std::get<const StandaloneDataInMemory *>(SDIMOrRecord)->getContent();
+ } else {
+ auto DataHandle = std::get<DataRecordHandle>(std::move(SDIMOrRecord));
+ return OnDiskContent{std::move(DataHandle), std::nullopt};
+ }
}
ArrayRef<char> OnDiskGraphDB::getObjectData(ObjectHandle Node) const {
OnDiskContent Content = getContentFromHandle(DataPool, Node);
- if (Content.Bytes)
- return *Content.Bytes;
- assert(Content.Record && "Expected record or bytes");
- return Content.Record->getData();
+ return Content.getData();
}
InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
@@ -1232,6 +1257,18 @@ InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
return std::nullopt;
}
+OnDiskGraphDB::FileBackedData
+OnDiskGraphDB::getInternalFileBackedObjectData(ObjectHandle Node) const {
+ auto SDIMOrRecord = getStandaloneDataOrDataRecord(DataPool, Node);
+ if (std::holds_alternative<const StandaloneDataInMemory *>(SDIMOrRecord)) {
+ auto *SDIM = std::get<const StandaloneDataInMemory *>(SDIMOrRecord);
+ return SDIM->getInternalFileBackedObjectData(RootPath);
+ } else {
+ auto DataHandle = std::get<DataRecordHandle>(std::move(SDIMOrRecord));
+ return FileBackedData{DataHandle.getData(), /*FileInfo=*/std::nullopt};
+ }
+}
+
Expected<std::optional<ObjectHandle>>
OnDiskGraphDB::load(ObjectID ExternalRef) {
InternalRef Ref = getInternalRef(ExternalRef);
@@ -1269,7 +1306,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) {
// suitably 0-padded. Requiring null-termination here would be too expensive
// for extremely large objects that happen to be page-aligned.
SmallString<256> Path;
- getStandalonePath(TrieRecord::getStandaloneFilePrefix(Object.SK), *I, Path);
+ getStandalonePath(TrieRecord::getStandaloneFilePrefix(Object.SK), I->Offset,
+ Path);
auto BypassSandbox = sys::sandbox::scopedDisable();
@@ -1291,7 +1329,7 @@ OnDiskGraphDB::load(ObjectID ExternalRef) {
return ObjectHandle::fromMemory(
static_cast<StandaloneDataMapTy *>(StandaloneData)
- ->insert(I->Hash, Object.SK, std::move(Region)));
+ ->insert(I->Hash, Object.SK, std::move(Region), I->Offset));
}
Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) {
@@ -1337,11 +1375,17 @@ InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) {
return InternalRef::getFromOffset(IndexOffset);
}
-void OnDiskGraphDB::getStandalonePath(StringRef Prefix, const IndexProxy &I,
- SmallVectorImpl<char> &Path) const {
+static void getStandalonePath(StringRef RootPath, StringRef Prefix,
+ FileOffset IndexOffset,
+ SmallVectorImpl<char> &Path) {
Path.assign(RootPath.begin(), RootPath.end());
sys::path::append(Path,
- Prefix + Twine(I.Offset.get()) + "." + CASFormatVersion);
+ Prefix + Twine(IndexOffset.get()) + "." + CASFormatVersion);
+}
+
+void OnDiskGraphDB::getStandalonePath(StringRef Prefix, FileOffset IndexOffset,
+ SmallVectorImpl<char> &Path) const {
+ return ::getStandalonePath(RootPath, Prefix, IndexOffset, Path);
}
OnDiskContent StandaloneDataInMemory::getContent() const {
@@ -1374,6 +1418,28 @@ OnDiskContent StandaloneDataInMemory::getContent() const {
return OnDiskContent{Record, std::nullopt};
}
+OnDiskGraphDB::FileBackedData
+StandaloneDataInMemory::getInternalFileBackedObjectData(
+ StringRef RootPath) const {
+ switch (SK) {
+ case TrieRecord::StorageKind::Unknown:
+ case TrieRecord::StorageKind::DataPool:
+ llvm_unreachable("unexpected storage kind");
+ case TrieRecord::StorageKind::Standalone:
+ return OnDiskGraphDB::FileBackedData{getContent().getData(),
+ /*FileInfo=*/std::nullopt};
+ case TrieRecord::StorageKind::StandaloneLeaf0:
+ case TrieRecord::StorageKind::StandaloneLeaf:
+ bool IsFileNulTerminated = SK == TrieRecord::StorageKind::StandaloneLeaf0;
+ SmallString<256> Path;
+ ::getStandalonePath(RootPath, TrieRecord::getStandaloneFilePrefix(SK),
+ IndexOffset, Path);
+ return OnDiskGraphDB::FileBackedData{
+ getContent().getData(), OnDiskGraphDB::FileBackedData::FileInfoTy{
+ std::string(Path), IsFileNulTerminated}};
+ }
+}
+
static Expected<MappedTempFile>
createTempFile(StringRef FinalPath, uint64_t Size, OnDiskCASLogger *Logger) {
auto BypassSandbox = sys::sandbox::scopedDisable();
@@ -1413,7 +1479,7 @@ Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data) {
SmallString<256> Path;
int64_t FileSize = Data.size() + Leaf0;
- getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I, Path);
+ getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I.Offset, Path);
auto BypassSandbox = sys::sandbox::scopedDisable();
@@ -1484,7 +1550,7 @@ Error OnDiskGraphDB::store(ObjectID ID, ArrayRef<ObjectID> Refs,
auto AllocStandaloneFile = [&](size_t Size) -> Expected<char *> {
getStandalonePath(TrieRecord::getStandaloneFilePrefix(
TrieRecord::StorageKind::Standalone),
- *I, Path);
+ I->Offset, Path);
if (Error E = createTempFile(Path, Size, Logger.get()).moveInto(File))
return std::move(E);
assert(File->size() == Size);
@@ -1567,6 +1633,117 @@ Error OnDiskGraphDB::store(ObjectID ID, ArrayRef<ObjectID> Refs,
return Error::success();
}
+Error OnDiskGraphDB::storeFile(ObjectID ID, StringRef FilePath) {
+ return storeFile(ID, FilePath, /*ImportKind=*/std::nullopt);
+}
+
+Error OnDiskGraphDB::storeFile(
+ ObjectID ID, StringRef FilePath,
+ std::optional<InternalUpstreamImportKind> ImportKind) {
+ auto I = getIndexProxyFromRef(getInternalRef(ID));
+ if (LLVM_UNLIKELY(!I))
+ return I.takeError();
+
+ // Early return in case the node exists.
+ {
+ TrieRecord::Data Existing = I->Ref.load();
+ if (Existing.SK != TrieRecord::StorageKind::Unknown)
+ return Error::success();
+ }
+
+ auto BypassSandbox = sys::sandbox::scopedDisable();
+
+ uint64_t FileSize;
+ if (std::error_code EC = sys::fs::file_size(FilePath, FileSize))
+ return createFileError(FilePath, EC);
+
+ if (FileSize <= TrieRecord::MaxEmbeddedSize) {
+ auto Buf = MemoryBuffer::getFile(FilePath);
+ if (!Buf)
+ return createFileError(FilePath, Buf.getError());
+ return store(ID, {}, arrayRefFromStringRef<char>((*Buf)->getBuffer()));
+ }
+
+ StringRef FromPath;
+ SmallString<256> TmpPath;
+
+ auto RemoveTmpFile = scope_exit([&TmpPath] {
+ if (!TmpPath.empty())
+ sys::fs::remove(TmpPath);
+ });
+
+ // \c clonefile requires that the destination path doesn't exist. We create
+ // a "placeholder" temporary file, then modify its path a bit and use that
+ // for \c clonefile to write to.
+ // FIXME: Instead of creating a dummy file, add a new file system API for
+ // copying to a unique path that can loop while checking EEXIST.
+ SmallString<256> UniqueTmpPath;
+ if (std::error_code EC =
+ sys::fs::createUniqueFile(RootPath + "/tmp.%%%%%%%", UniqueTmpPath))
+ return createFileError(RootPath + "/tmp.%%%%%%%", EC);
+ auto RemoveUniqueFile =
+ scope_exit([&UniqueTmpPath] { sys::fs::remove(UniqueTmpPath); });
+ TmpPath = UniqueTmpPath;
+ TmpPath += 'c'; // modify so that there's no file at that path.
+ // \c copy_file will use \c clonefile when applicable.
+ if (std::error_code EC = sys::fs::copy_file(FilePath, TmpPath))
+ return createFileError(FilePath, EC);
+ FromPath = TmpPath;
+
+ TrieRecord::StorageKind SK;
+ if (ImportKind.has_value()) {
+ // Importing the file from upstream, the nul is already added if necessary.
+ switch (*ImportKind) {
+ case InternalUpstreamImportKind::Leaf:
+ SK = TrieRecord::StorageKind::StandaloneLeaf;
+ break;
+ case InternalUpstreamImportKind::Leaf0:
+ SK = TrieRecord::StorageKind::StandaloneLeaf0;
+ break;
+ }
+ } else {
+ bool Leaf0 = isAligned(Align(getPageSize()), FileSize);
+ SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
+ : TrieRecord::StorageKind::StandaloneLeaf;
+
+ if (Leaf0) {
+ // Add a nul byte at the end.
+ std::error_code EC;
+ raw_fd_ostream OS(FromPath, EC, sys::fs::CD_OpenExisting,
+ sys::fs::FA_Write, sys::fs::OF_Append);
+ if (EC)
+ return createFileError(FromPath, EC);
+ OS.write(0);
+ OS.close();
+ if (OS.has_error())
+ return createFileError(FromPath, OS.error());
+ }
+ }
+
+ SmallString<256> StandalonePath;
+ getStandalonePath(TrieRecord::getStandaloneFilePrefix(SK), I->Offset,
+ StandalonePath);
+ if (std::error_code EC = sys::fs::rename(FromPath, StandalonePath))
+ return createFileError(FromPath, EC);
+ TmpPath.clear();
+
+ // Store the object reference.
+ TrieRecord::Data Existing;
+ {
+ TrieRecord::Data Leaf{SK, FileOffset()};
+ if (I->Ref.compare_exchange_strong(Existing, Leaf)) {
+ recordStandaloneSizeIncrease(FileSize);
+ return Error::success();
+ }
+ }
+
+ // If there was a race, confirm that the new value has valid storage.
+ if (Existing.SK == TrieRecord::StorageKind::Unknown)
+ return createCorruptObjectError(getDigest(*I));
+
+ return Error::success();
+}
+
void OnDiskGraphDB::recordStandaloneSizeIncrease(size_t SizeIncrease) {
standaloneStorageSize().fetch_add(SizeIncrease, std::memory_order_relaxed);
}
@@ -1715,8 +1892,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
UpstreamCursor &Cur = CursorStack.back();
if (Cur.RefI == Cur.RefE) {
// Copy the node data into the primary store.
- // FIXME: Use hard-link or cloning if the file-system supports it and data
- // is stored into a separate file.
// The bottom of \p PrimaryNodesStack contains the primary ID for the
// current node plus the list of imported referenced IDs.
@@ -1724,8 +1899,7 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1);
auto PrimaryRefs = ArrayRef(PrimaryNodesStack)
.slice(PrimaryNodesStack.size() - Cur.RefsCount);
- auto Data = UpstreamDB->getObjectData(Cur.Node);
- if (Error E = store(PrimaryID, PrimaryRefs, Data))
+ if (Error E = importUpstreamData(PrimaryID, PrimaryRefs, Cur.Node))
return E;
// Remove the current node and its IDs from the stack.
PrimaryNodesStack.truncate(PrimaryNodesStack.size() - Cur.RefsCount);
@@ -1760,10 +1934,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
ObjectHandle UpstreamNode) {
// Copies only a single node, it doesn't copy the referenced nodes.
- // Copy the node data into the primary store.
- // FIXME: Use hard-link or cloning if the file-system supports it and data is
- // stored into a separate file.
- auto Data = UpstreamDB->getObjectData(UpstreamNode);
auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode);
SmallVector<ObjectID, 64> Refs;
Refs.reserve(llvm::size(UpstreamRefs));
@@ -1774,7 +1944,29 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
Refs.push_back(*Ref);
}
- return store(PrimaryID, Refs, Data);
+ return importUpstreamData(PrimaryID, Refs, UpstreamNode);
+}
+
+Error OnDiskGraphDB::importUpstreamData(ObjectID PrimaryID,
+ ArrayRef<ObjectID> PrimaryRefs,
+ ObjectHandle UpstreamNode) {
+ // If there are references we can't copy an upstream's standalone file because
+ // we need to re-resolve the reference offsets it contains.
+ if (PrimaryRefs.empty()) {
+ auto FBData = UpstreamDB->getInternalFileBackedObjectData(UpstreamNode);
+ if (FBData.FileInfo.has_value()) {
+ // Disk-space optimization, import the file directly since it is a
+ // standalone leaf.
+ return storeFile(
+ PrimaryID, FBData.FileInfo->FilePath,
+ /*InternalUpstreamImport=*/FBData.FileInfo->IsFileNulTerminated
+ ? InternalUpstreamImportKind::Leaf0
+ : InternalUpstreamImportKind::Leaf);
+ }
+ }
+
+ auto Data = UpstreamDB->getObjectData(UpstreamNode);
+ return store(PrimaryID, PrimaryRefs, Data);
}
Expected<std::optional<ObjectHandle>>
diff --git a/llvm/unittests/CAS/BuiltinObjectHasherTest.cpp b/llvm/unittests/CAS/BuiltinObjectHasherTest.cpp
new file mode 100644
index 0000000000000..23243bd07a1bc
--- /dev/null
+++ b/llvm/unittests/CAS/BuiltinObjectHasherTest.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/Support/BLAKE3.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+
+using HasherT = BLAKE3;
+using HashType = BuiltinObjectHasher<HasherT>::HashT;
+
+TEST(BuiltinObjectHasherTest, Basic) {
+ unittest::TempFile TmpFile("somefile.o", /*Suffix=*/"", /*Contents=*/"",
+ /*Unique=*/true);
+ {
+ std::error_code EC;
+ raw_fd_stream Out(TmpFile.path(), EC);
+ ASSERT_FALSE(EC);
+ SmallVector<char, 200> Data;
+ for (unsigned i = 1; i != 201; ++i) {
+ Data.push_back(i);
+ }
+ for (unsigned i = 0; i != 1000; ++i) {
+ Out.write(Data.data(), Data.size());
+ }
+ }
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
+ MemoryBuffer::getFile(TmpFile.path());
+ ASSERT_TRUE(!!MB);
+ ASSERT_NE(*MB, nullptr);
+
+ HashType Hash1 =
+ BuiltinObjectHasher<HasherT>::hashObject({}, (*MB)->getBuffer());
+ std::optional<HashType> Hash2;
+ ASSERT_THAT_ERROR(
+ BuiltinObjectHasher<HasherT>::hashFile(TmpFile.path()).moveInto(Hash2),
+ Succeeded());
+ EXPECT_EQ(Hash1, *Hash2);
+}
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index 11def9be5e04c..1bee2324d06c0 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,4 +1,5 @@
set(ONDISK_CAS_TEST_SOURCES
+ BuiltinObjectHasherTest.cpp
BuiltinUnifiedCASDatabasesTest.cpp
OnDiskCASLoggerTest.cpp
OnDiskGraphDBTest.cpp
diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h
index 48a1830f9b219..770f5acbc4749 100644
--- a/llvm/unittests/CAS/OnDiskCommonUtils.h
+++ b/llvm/unittests/CAS/OnDiskCommonUtils.h
@@ -46,6 +46,21 @@ inline HashType digest(StringRef Data) {
return HasherT::hash(arrayRefFromStringRef(Data));
}
+inline HashType digestFile(StringRef FilePath) {
+ std::optional<HashType> Digest;
+ EXPECT_THAT_ERROR(
+ BuiltinObjectHasher<HasherT>::hashFile(FilePath).moveInto(Digest),
+ Succeeded());
+ return *Digest;
+}
+
+inline ObjectID digestFile(OnDiskGraphDB &DB, StringRef FilePath) {
+ HashType Digest = digestFile(FilePath);
+ std::optional<ObjectID> ID;
+ EXPECT_THAT_ERROR(DB.getReference(Digest).moveInto(ID), Succeeded());
+ return *ID;
+}
+
inline ValueType valueFromString(StringRef S) {
ValueType Val = {};
llvm::copy(S.substr(0, sizeof(Val)), Val.data());
diff --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
index a4df3c5f6f2b3..4ecb6b6a1e864 100644
--- a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
+++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp
@@ -8,6 +8,9 @@
#include "CASTestConfig.h"
#include "OnDiskCommonUtils.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
#include "llvm/Testing/Support/Error.h"
#include "llvm/Testing/Support/SupportHelpers.h"
#include "gtest/gtest.h"
@@ -293,6 +296,211 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInPolicyConflict) {
OnDiskGraphDB::FaultInPolicy::SingleNode);
}
+static std::unique_ptr<unittest::TempFile> createLargeFile(char initChar) {
+ auto TmpFile = std::make_unique<unittest::TempFile>(
+ "largefile.o", /*Suffix=*/"", /*Contents=*/"",
+ /*Unique=*/true);
+ StringRef Path = TmpFile->path();
+ std::error_code EC;
+ raw_fd_stream Out(Path, EC);
+ EXPECT_FALSE(EC);
+ SmallString<200> Data;
+ Data += initChar;
+ for (unsigned i = 1; i != 200; ++i) {
+ Data += i;
+ }
+ for (unsigned i = 0; i != 1000; ++i) {
+ Out.write(Data.data(), Data.size());
+ }
+ return TmpFile;
+}
+
+static std::unique_ptr<unittest::TempFile>
+createLargePageAlignedFile(char initChar) {
+ auto TmpFile = std::make_unique<unittest::TempFile>(
+ "largepagealignedfile.o", /*Suffix=*/"", /*Contents=*/"",
+ /*Unique=*/true);
+ StringRef Path = TmpFile->path();
+ std::error_code EC;
+ raw_fd_stream Out(Path, EC);
+ EXPECT_FALSE(EC);
+ SmallString<256> Data;
+ Data += initChar;
+ for (unsigned i = 1; i != sys::Process::getPageSizeEstimate(); ++i) {
+ Data += char(i);
+ }
+ for (unsigned i = 0; i != 64; ++i) {
+ Out.write(Data.data(), Data.size());
+ }
+ Out.close();
+ uint64_t FileSize;
+ EC = sys::fs::file_size(Path, FileSize);
+ EXPECT_FALSE(EC);
+ assert(isAligned(Align(sys::Process::getPageSizeEstimate()), FileSize));
+ return TmpFile;
+}
+
+TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInLargeFile) {
+ auto runCommonTests =
+ [](function_ref<std::unique_ptr<unittest::TempFile>(char)> createFileFn) {
+ unittest::TempDir TempUpstream("ondiskcas-upstream", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(TempUpstream.path(), "blake3", sizeof(HashType))
+ .moveInto(UpstreamDB),
+ Succeeded());
+
+ auto TmpFile = createFileFn('a');
+ auto Path = TmpFile->path();
+ HashType FileDigest = digestFile(Path);
+ std::optional<ObjectID> UpstrID;
+ ASSERT_THAT_ERROR(
+ UpstreamDB->getReference(FileDigest).moveInto(UpstrID),
+ Succeeded());
+ ASSERT_THAT_ERROR(UpstreamDB->storeFile(*UpstrID, Path), Succeeded());
+
+ unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> DB;
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType),
+ UpstreamDB.get(), /*Logger=*/nullptr,
+ OnDiskGraphDB::FaultInPolicy::SingleNode)
+ .moveInto(DB),
+ Succeeded());
+
+ std::optional<ObjectID> ID1;
+ ASSERT_THAT_ERROR(DB->getReference(FileDigest).moveInto(ID1),
+ Succeeded());
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB->load(*ID1).moveInto(Obj), Succeeded());
+ ASSERT_TRUE(Obj.has_value());
+
+ std::optional<ObjectID> ID2;
+ ASSERT_THAT_ERROR(
+ store(*DB, toStringRef(DB->getObjectData(*Obj)), {}).moveInto(ID2),
+ Succeeded());
+ ASSERT_TRUE(ID2.has_value());
+ EXPECT_EQ(*ID1, *ID2);
+ };
+
+ runCommonTests(createLargeFile);
+ runCommonTests(createLargePageAlignedFile);
+}
+
+TEST_F(OnDiskCASTest, OnDiskGraphDBFileAPIs) {
+ unittest::TempDir Temp("ondiskcas", /*Unique=*/true);
+ std::unique_ptr<OnDiskGraphDB> DB;
+ ASSERT_THAT_ERROR(
+ OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType)).moveInto(DB),
+ Succeeded());
+
+ SmallVector<std::unique_ptr<unittest::TempFile>, 4> TempFiles;
+
+ // Create a file with small size and and controlling the initial byte so
+ // caller can create different contents.
+ auto createSmallFile = [&TempFiles](char initChar) -> StringRef {
+ TempFiles.push_back(std::make_unique<unittest::TempFile>(
+ "smallfile.o", /*Suffix=*/"", /*Contents=*/"",
+ /*Unique=*/true));
+ StringRef Path = TempFiles.back()->path();
+ std::error_code EC;
+ raw_fd_stream Out(Path, EC);
+ EXPECT_FALSE(EC);
+ SmallString<200> Data;
+ Data += initChar;
+ for (unsigned i = 1; i != 200; ++i) {
+ Data += i;
+ }
+ Out.write(Data.data(), Data.size());
+ return Path;
+ };
+
+ auto createLargeFile = [&TempFiles](char initChar) -> StringRef {
+ TempFiles.push_back(::createLargeFile(initChar));
+ return TempFiles.back()->path();
+ };
+
+ auto createLargePageAlignedFile = [&TempFiles](char initChar) -> StringRef {
+ TempFiles.push_back(::createLargePageAlignedFile(initChar));
+ return TempFiles.back()->path();
+ };
+
+ auto runCommonTests =
+ [&DB](function_ref<StringRef(char)> createFileFn,
+ function_ref<void(const OnDiskGraphDB::FileBackedData &FBD)>
+ additionalChecks) {
+ {
+ auto FilePath = createFileFn('a');
+ ObjectID ID1 = digestFile(*DB, FilePath);
+ ASSERT_THAT_ERROR(DB->storeFile(ID1, FilePath), Succeeded());
+ EXPECT_TRUE(sys::fs::exists(FilePath));
+
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB->load(ID1).moveInto(Obj), Succeeded());
+ EXPECT_TRUE(DB->getObjectRefs(*Obj).empty());
+ ArrayRef<char> Contents = DB->getObjectData(*Obj);
+ EXPECT_EQ(Contents.data()[Contents.size()], '\0');
+ ObjectID ID2 = digest(*DB, toStringRef(Contents), {});
+ EXPECT_EQ(ID1, ID2);
+
+ auto FBD = DB->getInternalFileBackedObjectData(*Obj);
+ EXPECT_EQ(FBD.Data, Contents);
+ additionalChecks(FBD);
+ }
+ };
+
+ auto checkSmallFile = [](const OnDiskGraphDB::FileBackedData &FBD) {
+ EXPECT_FALSE(FBD.FileInfo.has_value());
+ };
+
+ auto checkLargeFile = [](const OnDiskGraphDB::FileBackedData &FBD) {
+ ASSERT_TRUE(FBD.FileInfo.has_value());
+ EXPECT_FALSE(FBD.FileInfo->IsFileNulTerminated);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
+ MemoryBuffer::getFile(FBD.FileInfo->FilePath);
+ ASSERT_TRUE(!!MB);
+ ASSERT_NE(*MB, nullptr);
+ EXPECT_EQ((*MB)->getBuffer(), toStringRef(FBD.Data));
+ };
+
+ auto checkLargePageAlignedFile =
+ [](const OnDiskGraphDB::FileBackedData &FBD) {
+ ASSERT_TRUE(FBD.FileInfo.has_value());
+ EXPECT_TRUE(FBD.FileInfo->IsFileNulTerminated);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
+ MemoryBuffer::getFile(FBD.FileInfo->FilePath);
+ ASSERT_TRUE(!!MB);
+ ASSERT_NE(*MB, nullptr);
+ EXPECT_EQ((*MB)->getBuffer().back(), '\0');
+ EXPECT_EQ((*MB)->getBuffer().drop_back(1), toStringRef(FBD.Data));
+ };
+
+ runCommonTests(createSmallFile, checkSmallFile);
+ runCommonTests(createLargeFile, checkLargeFile);
+ runCommonTests(createLargePageAlignedFile, checkLargePageAlignedFile);
+
+ // Check non-leaf node.
+ {
+ std::optional<ObjectID> ID1;
+ ASSERT_THAT_ERROR(store(*DB, "hello", {}).moveInto(ID1), Succeeded());
+
+ auto Path = createLargeFile('r');
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MB = MemoryBuffer::getFile(Path);
+ ASSERT_TRUE(!!MB);
+ ASSERT_NE(*MB, nullptr);
+ std::optional<ObjectID> ID2;
+ ASSERT_THAT_ERROR(store(*DB, (*MB)->getBuffer(), *ID1).moveInto(ID2),
+ Succeeded());
+
+ std::optional<ondisk::ObjectHandle> Obj;
+ ASSERT_THAT_ERROR(DB->load(*ID2).moveInto(Obj), Succeeded());
+ ArrayRef<char> Contents = DB->getObjectData(*Obj);
+ auto FBD = DB->getInternalFileBackedObjectData(*Obj);
+ EXPECT_EQ(FBD.Data, Contents);
+ EXPECT_FALSE(FBD.FileInfo.has_value());
+ }
+}
+
#if defined(EXPENSIVE_CHECKS) && !defined(_WIN32)
TEST_F(OnDiskCASTest, OnDiskGraphDBSpaceLimit) {
setMaxOnDiskCASMappingSize();
>From a6a039b6351a6eb9f3bd8858b677ce5241eceb76 Mon Sep 17 00:00:00 2001
From: Jan Voung <jvoung at google.com>
Date: Fri, 6 Feb 2026 16:30:14 -0500
Subject: [PATCH 07/13] [clang][analysis][dataflow] Detect goto backedges to
trigger Widen (#179546)
Currently, the Clang Dataflow Framework only does Widen on backedges
from structured loops.
Missing some Widen calls (e.g., when there are backedges from gotos)
could cause some analyses to iterate ~forever (until the max visits
limit is hit).
This adds a simple search for backedges, and triggers Widen on the
additional backedge nodes. Fixes
[issue 179083](https://github.com/llvm/llvm-project/issues/179083).
---
clang/include/clang/Analysis/CFGBackEdges.h | 54 +++
clang/lib/Analysis/CFGBackEdges.cpp | 105 ++++++
clang/lib/Analysis/CMakeLists.txt | 1 +
.../TypeErasedDataflowAnalysis.cpp | 19 +-
clang/unittests/Analysis/CFGBackEdgesTest.cpp | 312 ++++++++++++++++++
clang/unittests/Analysis/CMakeLists.txt | 1 +
.../TypeErasedDataflowAnalysisTest.cpp | 67 +++-
7 files changed, 543 insertions(+), 16 deletions(-)
create mode 100644 clang/include/clang/Analysis/CFGBackEdges.h
create mode 100644 clang/lib/Analysis/CFGBackEdges.cpp
create mode 100644 clang/unittests/Analysis/CFGBackEdgesTest.cpp
diff --git a/clang/include/clang/Analysis/CFGBackEdges.h b/clang/include/clang/Analysis/CFGBackEdges.h
new file mode 100644
index 0000000000000..d7cbe5818fc6a
--- /dev/null
+++ b/clang/include/clang/Analysis/CFGBackEdges.h
@@ -0,0 +1,54 @@
+//===- CFGBackEdges.h - Finds back edges in Clang CFGs ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_ANALYSIS_CFG_BACKEDGES_H
+#define LLVM_CLANG_ANALYSIS_CFG_BACKEDGES_H
+
+#include "clang/Analysis/CFG.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+
+namespace clang {
+
+/// Finds and returns back edges in Clang CFGs. The CFG already has some
+/// backedge information for structured loops (\c CFGBlock::getLoopTarget).
+/// However, unstructured back edges from \c goto statements are not included.
+/// This helps find back edges, whether the CFG is reducible or not.
+/// This includes CFGBlock::getLoopTarget nodes, but one can filter those out
+/// e.g., with \c findNonStructuredLoopBackedgeNodes.
+llvm::DenseMap<const CFGBlock *, const CFGBlock *>
+findCFGBackEdges(const CFG &CFG);
+
+/// Returns the set of CFG blocks that are each the source of a backedge and
+/// are not tracked as part of a structured loop (with `CFGBlock::getLoopTarget`).
+llvm::SmallDenseSet<const CFGBlock *>
+findNonStructuredLoopBackedgeNodes(const CFG &CFG);
+
+/// Given a backedge from B1 to B2, B1 is a "backedge node" in a CFG.
+/// It can be:
+/// - A block introduced in the CFG exclusively to indicate a structured loop's
+/// backedge. They are exactly identified by the presence of a non-null
+/// pointer to the entry block of the loop condition. Note that this is not
+/// necessarily the block with the loop statement as terminator, because
+/// short-circuit operators will result in multiple blocks encoding the loop
+/// condition, only one of which will contain the loop statement as
+/// terminator.
+/// - A block that is part of a backedge in a CFG with unstructured loops
+/// (e.g., a CFG with a `goto` statement). Note that this is not necessarily
+/// the block with the goto statement as terminator. The choice depends on how
+/// blocks and edges are ordered.
+///
+/// \param NonStructLoopBackedgeNodes is the set of nodes from
+/// \c findNonStructuredLoopBackedgeNodes.
+bool isBackedgeCFGNode(
+ const CFGBlock &B,
+ const llvm::SmallDenseSet<const CFGBlock *> &NonStructLoopBackedgeNodes);
+
+} // namespace clang
+
+#endif // LLVM_CLANG_ANALYSIS_CFG_BACKEDGES_H
diff --git a/clang/lib/Analysis/CFGBackEdges.cpp b/clang/lib/Analysis/CFGBackEdges.cpp
new file mode 100644
index 0000000000000..9018d1a594ed2
--- /dev/null
+++ b/clang/lib/Analysis/CFGBackEdges.cpp
@@ -0,0 +1,105 @@
+//===- CFGBackEdges.cpp - Finds back edges in Clang CFGs ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stack>
+#include <utility>
+#include <vector>
+
+#include "clang/Analysis/CFG.h"
+#include "clang/Analysis/CFGBackEdges.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace clang {
+
+namespace {
+struct VisitClockTimes {
+ // Timestamp for when the node was visited / discovered.
+ int Pre = -1;
+ // Timestamp for when we finished visiting a node's successors.
+ int Post = -1;
+};
+} // namespace
+
+// Returns true if the CFG contains any goto statements (direct or indirect).
+static bool hasGotoInCFG(const CFG &CFG) {
+ for (const CFGBlock *Block : CFG) {
+ const Stmt *Term = Block->getTerminatorStmt();
+ if (Term == nullptr)
+ continue;
+ if (isa<GotoStmt>(Term) || isa<IndirectGotoStmt>(Term))
+ return true;
+ }
+ return false;
+}
+
+llvm::DenseMap<const CFGBlock *, const CFGBlock *>
+findCFGBackEdges(const CFG &CFG) {
+ // Do a simple textbook DFS with pre and post numberings to find back edges.
+ llvm::DenseMap<const CFGBlock *, const CFGBlock *> BackEdges;
+
+ std::vector<VisitClockTimes> VisitState;
+ VisitState.resize(CFG.getNumBlockIDs());
+ std::stack<std::pair<const CFGBlock *, CFGBlock::const_succ_iterator>>
+ DFSStack;
+ int Clock = 0;
+ const CFGBlock &Entry = CFG.getEntry();
+ VisitState[Entry.getBlockID()].Pre = Clock++;
+ DFSStack.push({&Entry, Entry.succ_begin()});
+
+ while (!DFSStack.empty()) {
+ auto &[Block, SuccIt] = DFSStack.top();
+ if (SuccIt == Block->succ_end()) {
+ VisitState[Block->getBlockID()].Post = Clock++;
+ DFSStack.pop();
+ continue;
+ }
+
+ const CFGBlock::AdjacentBlock &AdjacentSucc = *SuccIt++;
+ const CFGBlock *Succ = AdjacentSucc.getReachableBlock();
+ // Skip unreachable blocks.
+ if (Succ == nullptr)
+ continue;
+
+ VisitClockTimes &SuccVisitState = VisitState[Succ->getBlockID()];
+ if (SuccVisitState.Pre != -1) {
+ if (SuccVisitState.Post == -1)
+ BackEdges.insert({Block, Succ});
+ } else {
+ SuccVisitState.Pre = Clock++;
+ DFSStack.push({Succ, Succ->succ_begin()});
+ }
+ }
+ return BackEdges;
+}
+
+// Returns the set of CFG blocks that are each the source of a backedge and
+// are not tracked as part of a structured loop (with `CFGBlock::getLoopTarget`).
+llvm::SmallDenseSet<const CFGBlock *>
+findNonStructuredLoopBackedgeNodes(const CFG &CFG) {
+ llvm::SmallDenseSet<const CFGBlock *> NonStructLoopBackedgeNodes;
+ // We should only need this if the function has gotos.
+ if (!hasGotoInCFG(CFG))
+ return NonStructLoopBackedgeNodes;
+
+ llvm::DenseMap<const CFGBlock *, const CFGBlock *> Backedges =
+ findCFGBackEdges(CFG);
+ for (const auto &[From, To] : Backedges) {
+ if (From->getLoopTarget() == nullptr)
+ NonStructLoopBackedgeNodes.insert(From);
+ }
+ return NonStructLoopBackedgeNodes;
+}
+
+bool isBackedgeCFGNode(
+ const CFGBlock &B,
+ const llvm::SmallDenseSet<const CFGBlock *> &NonStructLoopBackedgeNodes) {
+ return B.getLoopTarget() != nullptr ||
+ NonStructLoopBackedgeNodes.contains(&B);
+}
+
+} // namespace clang
diff --git a/clang/lib/Analysis/CMakeLists.txt b/clang/lib/Analysis/CMakeLists.txt
index 65f160e965d47..fef688424978d 100644
--- a/clang/lib/Analysis/CMakeLists.txt
+++ b/clang/lib/Analysis/CMakeLists.txt
@@ -9,6 +9,7 @@ add_clang_library(clangAnalysis
BodyFarm.cpp
CalledOnceCheck.cpp
CFG.cpp
+ CFGBackEdges.cpp
CFGReachabilityAnalysis.cpp
CFGStmtMap.cpp
CallGraph.cpp
diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
index 1113bbe7f4d9c..02982274093cb 100644
--- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
@@ -19,18 +19,23 @@
#include "clang/AST/ASTDumper.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/OperationKinds.h"
+#include "clang/AST/Stmt.h"
#include "clang/AST/StmtCXX.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Analysis/Analyses/PostOrderCFGView.h"
#include "clang/Analysis/CFG.h"
+#include "clang/Analysis/CFGBackEdges.h"
#include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
#include "clang/Analysis/FlowSensitive/DataflowLattice.h"
#include "clang/Analysis/FlowSensitive/DataflowWorklist.h"
#include "clang/Analysis/FlowSensitive/Transfer.h"
#include "clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h"
#include "clang/Analysis/FlowSensitive/Value.h"
+#include "clang/Basic/LLVM.h"
#include "clang/Support/Compiler.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Error.h"
@@ -64,16 +69,6 @@ static int blockIndexInPredecessor(const CFGBlock &Pred,
return BlockPos - Pred.succ_begin();
}
-// A "backedge" node is a block introduced in the CFG exclusively to indicate a
-// loop backedge. They are exactly identified by the presence of a non-null
-// pointer to the entry block of the loop condition. Note that this is not
-// necessarily the block with the loop statement as terminator, because
-// short-circuit operators will result in multiple blocks encoding the loop
-// condition, only one of which will contain the loop statement as terminator.
-static bool isBackedgeNode(const CFGBlock &B) {
- return B.getLoopTarget() != nullptr;
-}
-
namespace {
/// Extracts the terminator's condition expression.
@@ -503,6 +498,8 @@ runTypeErasedDataflowAnalysis(
const clang::CFG &CFG = ACFG.getCFG();
PostOrderCFGView POV(&CFG);
ForwardDataflowWorklist Worklist(CFG, &POV);
+ llvm::SmallDenseSet<const CFGBlock *> NonStructLoopBackedgeNodes =
+ findNonStructuredLoopBackedgeNodes(CFG);
std::vector<std::optional<TypeErasedDataflowAnalysisState>> BlockStates(
CFG.size());
@@ -537,7 +534,7 @@ runTypeErasedDataflowAnalysis(
llvm::errs() << "Old Env:\n";
OldBlockState->Env.dump();
});
- if (isBackedgeNode(*Block)) {
+ if (isBackedgeCFGNode(*Block, NonStructLoopBackedgeNodes)) {
LatticeJoinEffect Effect1 = Analysis.widenTypeErased(
NewBlockState.Lattice, OldBlockState->Lattice);
LatticeJoinEffect Effect2 =
diff --git a/clang/unittests/Analysis/CFGBackEdgesTest.cpp b/clang/unittests/Analysis/CFGBackEdgesTest.cpp
new file mode 100644
index 0000000000000..1bf699c65ec81
--- /dev/null
+++ b/clang/unittests/Analysis/CFGBackEdgesTest.cpp
@@ -0,0 +1,312 @@
+//===- unittests/Analysis/CFGBackEdgesTest.cpp - CFG backedges tests ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/CFGBackEdges.h"
+#include "CFGBuildResult.h"
+#include "clang/AST/Stmt.h"
+#include "clang/Analysis/CFG.h"
+#include "clang/Basic/LLVM.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace analysis {
+namespace {
+
+using ::testing::IsNull;
+using ::testing::NotNull;
+using ::testing::SizeIs;
+
+TEST(CFGBackEdgesTest, NoBackedgesLinear) {
+ const char *Code = R"cc(
+ int f(int x) {
+ l1:
+ x++;
+ l2:
+ x++;
+ return x;
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_TRUE(BackEdges.empty());
+}
+
+TEST(CFGBackEdgesTest, NoBackedgesOnlyCrossEdge) {
+ const char *Code = R"cc(
+ int f(int x) {
+ if (x > 0)
+ x++;
+ else
+ x--;
+ return x;
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_TRUE(BackEdges.empty());
+}
+
+TEST(CFGBackEdgesTest, NoBackedgesWithUnreachableSuccessorForSwitch) {
+ const char *Code = R"cc(
+ enum class Kind { A, B };
+
+ void f(Kind kind) {
+ switch(kind) {
+ case Kind::A: return 0;
+ case Kind::B: break;
+ }
+ return 1;
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_TRUE(BackEdges.empty());
+}
+
+TEST(CFGBackEdgesTest, ForLoop) {
+ const char *Code = R"cc(
+ void f(int n) {
+ for (int i = 0; i < n; ++i) {}
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ // Finds one backedge, which is the one looping back to the loop header
+ // (has a loop target).
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_THAT(BackEdges, SizeIs(1));
+ EXPECT_THAT(BackEdges.begin()->first->getLoopTarget(), NotNull());
+}
+
+TEST(CFGBackEdgesTest, WhileLoop) {
+ const char *Code = R"cc(
+ void f(int n) {
+ int i = 0;
+ while (i < n) { ++i; }
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_THAT(BackEdges, SizeIs(1));
+ EXPECT_THAT(BackEdges.begin()->first->getLoopTarget(), NotNull());
+}
+
+TEST(CFGBackEdgesTest, DoWhileLoop) {
+ const char *Code = R"cc(
+ void f(int n) {
+ int i = 0;
+ do { ++i; } while (i < n);
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_THAT(BackEdges, SizeIs(1));
+ EXPECT_THAT(BackEdges.begin()->first->getLoopTarget(), NotNull());
+}
+
+TEST(CFGBackEdgesTest, GotoLoop) {
+ const char *Code = R"cc(
+ void f(int n) {
+ int i = 0;
+ loop:
+ if (i < n) {
+ ++i;
+ goto loop;
+ }
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ // Finds one backedge, but since it's an unstructured loop, the loop target is
+ // null. Instead, the node has a goto terminator.
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_THAT(BackEdges, SizeIs(1));
+ EXPECT_THAT(BackEdges.begin()->first->getLoopTarget(), IsNull());
+ EXPECT_TRUE(isa<GotoStmt>(BackEdges.begin()->first->getTerminatorStmt()));
+}
+
+TEST(CFGBackEdgesTest, WhileWithContinueLoop) {
+ const char *Code = R"cc(
+ void f(int n) {
+ int i = 0;
+ while (i < n) {
+ ++i;
+ if (i == 5) continue;
+ if (i == 10) break;
+ i *= 2;
+ }
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_THAT(BackEdges, SizeIs(testing::Gt(0)));
+ for (const auto &[From, To] : BackEdges)
+ EXPECT_THAT(From->getLoopTarget(), NotNull());
+}
+
+TEST(CFGBackEdgesTest, NestedForLoop) {
+ const char *Code = R"cc(
+ void f(int n) {
+ for (int i = 0; i < n; ++i) {
+ for (int j = i; j < n; ++j) {}
+ }
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_THAT(BackEdges, SizeIs(2));
+ auto It = BackEdges.begin();
+ auto *FirstLoopTarget = It->first->getLoopTarget();
+ EXPECT_THAT(FirstLoopTarget, NotNull());
+ ++It;
+ auto *SecondLoopTarget = It->first->getLoopTarget();
+ EXPECT_THAT(SecondLoopTarget, NotNull());
+ EXPECT_NE(FirstLoopTarget, SecondLoopTarget);
+}
+
+TEST(CFGBackEdgesTest, IrreducibleCFG) {
+ const char *Code = R"cc(
+ void f(int cond) {
+ if (cond) goto L1;
+ L0:
+ goto L1;
+ L1:
+ goto L0;
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ // In an irreducible CFG, we still expect to find a back edge.
+ EXPECT_THAT(BackEdges, SizeIs(1));
+ EXPECT_TRUE(isa<GotoStmt>(BackEdges.begin()->first->getTerminatorStmt()));
+}
+
+TEST(CFGBackEdgesTest, FirstBackedgeIsNotGoto) {
+ const char *Code = R"cc(
+ void f(int x, int y) {
+ if (x > y) {
+ } else {
+ L1:
+ --x;
+ if (x == 0) return;
+ }
+ goto L1;
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+
+ auto BackEdges = findCFGBackEdges(*Cfg);
+ EXPECT_THAT(BackEdges, SizeIs(1));
+ // We might find a backedge where the source block doesn't terminate with
+ // a `goto`, due to the DFS search order. For example:
+ //
+ // B_entry: `if (x > y)`
+ // \--then--> B1: `<empty>`
+ // --> B2: `goto L1`
+ // --> B3: `--x; if (x == 0)`
+ // \--then--> B4 `return` --> B_exit.
+ // \--else--> B2: ... (the `if`'s else is a backedge from B3 to B2!)
+ // \--else--> B3: ...
+ EXPECT_FALSE(isa<GotoStmt>(BackEdges.begin()->first->getTerminatorStmt()));
+}
+
+TEST(CFGBackEdgesTest, FindNonStructuredLoopBackedgeNodes) {
+ const char *Code = R"cc(
+ void f(int n) {
+ for (int i = 0; i < n; ++i) {
+ int j = 0;
+ inner_loop:
+ if (j < n) {
+ ++j;
+ goto inner_loop;
+ }
+ }
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ // Finds just the goto backedge, and not the for-loop backedge.
+ auto BackEdgeNodes = findNonStructuredLoopBackedgeNodes(*Cfg);
+ EXPECT_THAT(BackEdgeNodes, SizeIs(1));
+ const CFGBlock *Node = *BackEdgeNodes.begin();
+ EXPECT_EQ(Node->getLoopTarget(), nullptr);
+ EXPECT_TRUE(isa<GotoStmt>(Node->getTerminatorStmt()));
+}
+
+TEST(CFGBackEdgesTest, IsBackedgeCFGNode) {
+ const char *Code = R"cc(
+ void f(int n) {
+ for (int i = 0; i < n; ++i) {
+ int j = 0;
+ inner_loop:
+ if (j < n) {
+ ++j;
+ goto inner_loop;
+ }
+ }
+ })cc";
+ BuildResult Result = BuildCFG(Code);
+ EXPECT_EQ(BuildResult::BuiltCFG, Result.getStatus());
+ CFG *Cfg = Result.getCFG();
+ ASSERT_THAT(Cfg, NotNull());
+
+ auto BackEdgeNodes = findNonStructuredLoopBackedgeNodes(*Cfg);
+
+ // `isBackedgeCFGNode` should be true for both the for-loop backedge node and
+ // goto backedge nodes.
+ const CFGBlock *ForLoopBackedgeNode = nullptr;
+ const CFGBlock *GotoBackedgeNode = nullptr;
+ for (const CFGBlock *Block : *Cfg) {
+ if (Block->getLoopTarget() != nullptr) {
+ ForLoopBackedgeNode = Block;
+ } else if (Block->getTerminatorStmt() != nullptr &&
+ isa<GotoStmt>(Block->getTerminatorStmt())) {
+ GotoBackedgeNode = Block;
+ }
+ }
+ ASSERT_THAT(ForLoopBackedgeNode, NotNull());
+ ASSERT_THAT(GotoBackedgeNode, NotNull());
+ EXPECT_TRUE(isBackedgeCFGNode(*ForLoopBackedgeNode, BackEdgeNodes));
+ EXPECT_TRUE(isBackedgeCFGNode(*GotoBackedgeNode, BackEdgeNodes));
+}
+
+} // namespace
+} // namespace analysis
+} // namespace clang
diff --git a/clang/unittests/Analysis/CMakeLists.txt b/clang/unittests/Analysis/CMakeLists.txt
index 97e768b11db69..7cefa2caf3c91 100644
--- a/clang/unittests/Analysis/CMakeLists.txt
+++ b/clang/unittests/Analysis/CMakeLists.txt
@@ -1,4 +1,5 @@
add_clang_unittest(ClangAnalysisTests
+ CFGBackEdgesTest.cpp
CFGDominatorTree.cpp
CFGTest.cpp
CloneDetectionTest.cpp
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index d1dd4ff3ea33e..3a64bbdf52702 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -1145,6 +1145,34 @@ TEST_F(WideningTest, DistinctValuesWithDifferentPropertiesWidenedToTop) {
});
}
+TEST_F(WideningTest,
+ DistinctValuesWithDifferentPropertiesWidenedToTopGotoInsteadOfWhile) {
+ std::string Code = R"cc(
+ void target(bool Cond) {
+ int *Foo;
+ int i = 0;
+ Foo = nullptr;
+ start:
+ if (Cond) {
+ Foo = &i;
+ goto start;
+ }
+ (void)0;
+ /*[[p]]*/
+ }
+ )cc";
+ runDataflow(
+ Code,
+ [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+ ASTContext &ASTCtx) {
+ const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+ const auto &FooVal = getValueForDecl<Value>(ASTCtx, Env, "Foo");
+ ASSERT_THAT(FooVal.getProperty("is_null"), NotNull());
+ EXPECT_TRUE(areEquivalentValues(*FooVal.getProperty("is_null"),
+ Env.makeTopBoolValue()));
+ });
+}
+
class FlowConditionTest : public Test {
protected:
template <typename Matcher>
@@ -1253,18 +1281,47 @@ TEST_F(FlowConditionTest, WhileStmt) {
}
TEST_F(FlowConditionTest, WhileStmtWithAssignmentInCondition) {
- std::string Code = R"(
+ std::string Code = R"cc(
+ bool getBool();
+
void target(bool Foo) {
// This test checks whether the analysis preserves the connection between
// the value of `Foo` and the assignment expression, despite widening.
- // The equality operator generates a fresh boolean variable on each
- // interpretation, which forces use of widening.
- while ((Foo = (3 == 4))) {
+ // The return value of getBool() should have a fresh boolean variable on
+ // each interpretation, which forces use of widening.
+ while (Foo = getBool()) {
(void)0;
/*[[p]]*/
}
}
- )";
+ )cc";
+ runDataflow(
+ Code,
+ [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+ ASTContext &ASTCtx) {
+ const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+ auto &FooVal = getValueForDecl<BoolValue>(ASTCtx, Env, "Foo").formula();
+ EXPECT_TRUE(Env.proves(FooVal));
+ });
+}
+
+TEST_F(FlowConditionTest, GotoLoopWithAssignmentInCondition) {
+ std::string Code = R"cc(
+ bool getBool();
+
+ void target(bool Foo) {
+ // This test checks whether the analysis preserves the connection between
+ // the value of `Foo` and the assignment expression, despite widening.
+ // The return value of getBool() should have a fresh boolean variable on
+ // each interpretation, which forces use of widening.
+ start:
+ if (Foo = getBool()) {
+ (void)0;
+ /*[[p]]*/
+ goto start;
+ }
+ }
+ )cc";
runDataflow(
Code,
[](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
>From aa2d1688bded08570f36d81d577906203a0fcc91 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Fri, 6 Feb 2026 21:31:48 +0000
Subject: [PATCH 08/13] [gn build] Port 28042a87022b
---
llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
index 976f6dec25b72..b01eeed41814f 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
@@ -68,6 +68,7 @@ static_library("LLVMHexagonCodeGen") {
"HexagonGenMemAbsolute.cpp",
"HexagonGenMux.cpp",
"HexagonGenPredicate.cpp",
+ "HexagonGlobalRegion.cpp",
"HexagonHardwareLoops.cpp",
"HexagonHazardRecognizer.cpp",
"HexagonISelDAGToDAG.cpp",
>From 819974e2ee3026c578037504c514da68f6625234 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Fri, 6 Feb 2026 21:31:49 +0000
Subject: [PATCH 09/13] [gn build] Port 60ecb3789606
---
llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn | 1 +
llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn | 1 +
2 files changed, 2 insertions(+)
diff --git a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn
index 547ea818df82b..8579e8b0a9c32 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn
@@ -4,6 +4,7 @@ static_library("CAS") {
"ActionCache.cpp",
"ActionCaches.cpp",
"BuiltinCAS.cpp",
+ "BuiltinObjectHasher.cpp",
"BuiltinUnifiedCASDatabases.cpp",
"CASNodeSchema.cpp",
"DatabaseFile.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn
index cbffe9838e58c..904a3cfbd2136 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn
@@ -9,6 +9,7 @@ unittest("CASTests") {
]
sources = [
"ActionCacheTest.cpp",
+ "BuiltinObjectHasherTest.cpp",
"CASTestConfig.cpp",
"NamedValuesSchemaTest.cpp",
"ObjectStoreTest.cpp",
>From c624f143d8465acf3cdebaf5f4d94e96efe6f889 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Fri, 6 Feb 2026 21:31:50 +0000
Subject: [PATCH 10/13] [gn build] Port 66f9ffbb2c28
---
llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
index b01eeed41814f..c9ad9b6ceab61 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
@@ -76,6 +76,7 @@ static_library("LLVMHexagonCodeGen") {
"HexagonISelLowering.cpp",
"HexagonISelLoweringHVX.cpp",
"HexagonInstrInfo.cpp",
+ "HexagonLiveVariables.cpp",
"HexagonLoadStoreWidening.cpp",
"HexagonLoopAlign.cpp",
"HexagonLoopIdiomRecognition.cpp",
>From b128fae61ca64c38a7e8e82eb6c85ec50f8a695d Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Fri, 6 Feb 2026 21:31:52 +0000
Subject: [PATCH 11/13] [gn build] Port ec15bddde5cb
---
llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn | 1 +
llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn | 1 +
2 files changed, 2 insertions(+)
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
index 9b0618ded23a1..a893296f1f64f 100644
--- a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
@@ -15,6 +15,7 @@ static_library("Analysis") {
"AnnexKDetection.cpp",
"BodyFarm.cpp",
"CFG.cpp",
+ "CFGBackEdges.cpp",
"CFGReachabilityAnalysis.cpp",
"CFGStmtMap.cpp",
"CallGraph.cpp",
diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn
index ac2ce0c59c6b3..4bf4cddf98f36 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/BUILD.gn
@@ -13,6 +13,7 @@ unittest("ClangAnalysisTests") {
"//llvm/lib/Support",
]
sources = [
+ "CFGBackEdgesTest.cpp",
"CFGDominatorTree.cpp",
"CFGTest.cpp",
"CloneDetectionTest.cpp",
>From 35386372426acd34d0adfa558be9ac69ae93e825 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Fri, 6 Feb 2026 21:31:52 +0000
Subject: [PATCH 12/13] [gn build] Port eff21afae01f
---
llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
index b8a9d92f8dd30..73ed834599e02 100644
--- a/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/TargetParser/BUILD.gn
@@ -10,7 +10,6 @@ static_library("TargetParser") {
"AArch64TargetParser.cpp",
"ARMTargetParser.cpp",
"ARMTargetParserCommon.cpp",
- "AVRTargetParser.cpp",
"CSKYTargetParser.cpp",
"Host.cpp",
"LoongArchTargetParser.cpp",
>From c62af2a0a181d8ca8e0fb21d48464f93cc80e6a3 Mon Sep 17 00:00:00 2001
From: Christopher Di Bella <cjdb at google.com>
Date: Fri, 6 Feb 2026 21:45:41 +0000
Subject: [PATCH 13/13] fixes accidental revert
---
libcxx/include/__vector/vector.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h
index 37e46bf30fc6a..70e2d8b26f0f4 100644
--- a/libcxx/include/__vector/vector.h
+++ b/libcxx/include/__vector/vector.h
@@ -807,7 +807,7 @@ class vector {
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector&, false_type) _NOEXCEPT {}
template <class _Ptr = pointer, __enable_if_t<is_pointer<_Ptr>::value, int> = 0>
- static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI pointer
+ static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _Ptr
__add_alignment_assumption(_Ptr __p) _NOEXCEPT {
if (!__libcpp_is_constant_evaluated()) {
return static_cast<pointer>(__builtin_assume_aligned(__p, _LIBCPP_ALIGNOF(decltype(*__p))));
@@ -816,7 +816,7 @@ class vector {
}
template <class _Ptr = pointer, __enable_if_t<!is_pointer<_Ptr>::value, int> = 0>
- static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI pointer
+ static _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _Ptr
__add_alignment_assumption(_Ptr __p) _NOEXCEPT {
return __p;
}
More information about the cfe-commits
mailing list