[llvm-branch-commits] [clang] [clang-tools-extra] [compiler-rt] [libcxx] [lldb] [llvm] [mlir] [mlir][SCF] Fold unused `index_switch` results (PR #173560)

Matthias Springer via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Sun Dec 28 10:24:24 PST 2025


=?utf-8?b?6ZmI5a2Q5piC?= <2802328816 at qq.com>,NAKAMURA Takumi
 <geek4civic at gmail.com>,NAKAMURA Takumi <geek4civic at gmail.com>,mitchell
 <mitchell.xu2 at gmail.com>,lonely eagle <2020382038 at qq.com>,aokblast
 <aokblast at FreeBSD.org>,NAKAMURA Takumi <geek4civic at gmail.com>,Steven Perron
 <stevenperron at google.com>,Victor Chernyakin <chernyakin.victor.j at outlook.com>
 =?utf-8?q?,?=Florian Hahn <flo at fhahn.com>,Muhammad Abdul
 <alilo.ghazali at gmail.com>,Craig Topper <craig.topper at sifive.com>,paperchalice
 <liujunchang97 at outlook.com>,Twice <twice at apache.org>,Yunbo Ni
 <87902024+cardigan1008 at users.noreply.github.com>,Ben Shi <2283975856 at qq.com>,Tobias
 Gysi <tobias.gysi at nextsilicon.com>,sskzakaria <ssskzakaria at proton.me>,Mahesh-Attarde
 <mahesh.attarde at intel.com>,Dhruva Narayan K <dhruvakodiadka at gmail.com>,Eduardo
 Tachotte <bfwaend at gmail.com>,Owen Anderson <resistor at mac.com>,MetalOxideSemi
 <43286339+MetalOxideSemi at users.noreply.github.com>,Matthias Springer
 <me at m-sp.org>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/173560 at github.com>


https://github.com/matthias-springer updated https://github.com/llvm/llvm-project/pull/173560

>From 64496be8e0b07bbade36e68f70ee2ff002cfd0a8 Mon Sep 17 00:00:00 2001
From: peledins-zimperium
 <146088545+peledins-zimperium at users.noreply.github.com>
Date: Fri, 26 Dec 2025 14:03:25 +0200
Subject: [PATCH 01/34] [mlir] Fix typo s/opreations/operations (#163544)

---
 mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h             | 2 +-
 .../mlir/Dialect/Transform/Interfaces/TransformInterfaces.h     | 2 +-
 mlir/test/Examples/transform/ChH/full.mlir                      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h
index 5b7b45fdd1d58..82f86d06886b6 100644
--- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h
+++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h
@@ -505,7 +505,7 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis {
 
 /// A dense backward dataflow analysis propagating lattices after and before the
 /// execution of every operation across the IR by implementing transfer
-/// functions for opreations.
+/// functions for operations.
 ///
 /// `LatticeT` is expected to be a subclass of `AbstractDenseLattice`.
 template <typename LatticeT>
diff --git a/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h
index b9f2af22e9483..8b0517a84a675 100644
--- a/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h
+++ b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h
@@ -700,7 +700,7 @@ class TransformState {
   ///  - `throughValue` is the payload value the handle to which is consumed,
   ///     when it is the case, null when the operation handle is consumed
   ///     directly.
-  /// Looks at the payload opreations associated with `otherHandle` and if any
+  /// Looks at the payload operations associated with `otherHandle` and if any
   /// of these operations has an ancestor (or is itself) listed in
   /// `potentialAncestors`, records the error message describing the use of the
   /// invalidated handle. Does nothing if `otherHandle` already has a reporter
diff --git a/mlir/test/Examples/transform/ChH/full.mlir b/mlir/test/Examples/transform/ChH/full.mlir
index d49524b529a1d..1293406d72475 100644
--- a/mlir/test/Examples/transform/ChH/full.mlir
+++ b/mlir/test/Examples/transform/ChH/full.mlir
@@ -275,7 +275,7 @@ module attributes { transform.with_named_sequence } {
 
     // Vectorize the remaining non-unit dimensions in structured operations.
     // This essentially rewrites operations on `tensor<5x64xf32>` into
-    // opreations on `vector<5x64xf32>`. Further lowering in MLIR and LLVM will
+    // operations on `vector<5x64xf32>`. Further lowering in MLIR and LLVM will
     // decompose this into a sequence of operations on single-dimensional
     // vectors of the platform-relevant size, e.g., `vector<16xf32>` for AVX512.
     // High-level vector primitives, such as `vector.transpose` and

>From d5d49b8d737bdf7723c1de6899da83c46c2db5ab Mon Sep 17 00:00:00 2001
From: Suriyaa MM <143200860+SuriyaaMM at users.noreply.github.com>
Date: Fri, 26 Dec 2025 17:42:05 +0530
Subject: [PATCH 02/34] [mlir][bufferization] Return early in aliasing analysis
  (#173529)

Fix for [mlir-opt crashes in OneShotModuleBufferize.cpp:139
#173371](https://github.com/llvm/llvm-project/issues/173371#issue-3757290676).

It just returns `failure()` if there is no `func.return` op.
---
 .../Bufferization/Transforms/OneShotModuleBufferize.cpp     | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
index c233e24c2a151..fab45dfca125f 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp
@@ -136,7 +136,11 @@ aliasingFuncOpBBArgsAnalysis(FuncOp funcOp, OneShotAnalysisState &state,
 
   // Find all func.return ops.
   SmallVector<func::ReturnOp> returnOps = getReturnOps(funcOp);
-  assert(!returnOps.empty() && "expected at least one ReturnOp");
+  // TODO: throw error when there is any non-func.return op that has the
+  // ReturnLike trait
+  if (returnOps.empty()) {
+    return funcOp.emitError("cannot bufferize func.func without func.return");
+  }
 
   // Build alias sets. Merge all aliases from all func.return ops.
   for (BlockArgument bbArg : funcOp.getArguments()) {

>From 3b31f8ca1aa76f03db7146109da0b436fe54ef26 Mon Sep 17 00:00:00 2001
From: Hristo Hristov <hghristov.rmm at gmail.com>
Date: Fri, 26 Dec 2025 16:36:17 +0200
Subject: [PATCH 03/34] [libc++][ranges] Applied `[[nodiscard]]` to
 `filter_view` (#173460)

`[[nodiscard]]` should be applied to functions where discarding the
return value is most likely a correctness issue.

- https://libcxx.llvm.org/CodingGuidelines.html
- https://wg21.link/range.filter

Towards #172124
---
 libcxx/include/__ranges/filter_view.h         | 20 ++---
 .../range.filter/nodiscard.verify.cpp         | 78 +++++++++++++++++++
 2 files changed, 88 insertions(+), 10 deletions(-)
 create mode 100644 libcxx/test/libcxx/ranges/range.adaptors/range.filter/nodiscard.verify.cpp

diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h
index 07980e7353190..3ad69ea100931 100644
--- a/libcxx/include/__ranges/filter_view.h
+++ b/libcxx/include/__ranges/filter_view.h
@@ -76,16 +76,16 @@ class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS filter_view : public view_interface<f
       : __base_(std::move(__base)), __pred_(in_place, std::move(__pred)) {}
 
   template <class _Vp = _View>
-  _LIBCPP_HIDE_FROM_ABI constexpr _View base() const&
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _View base() const&
     requires copy_constructible<_Vp>
   {
     return __base_;
   }
-  _LIBCPP_HIDE_FROM_ABI constexpr _View base() && { return std::move(__base_); }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _View base() && { return std::move(__base_); }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr _Pred const& pred() const { return *__pred_; }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Pred const& pred() const { return *__pred_; }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr __iterator begin() {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __iterator begin() {
     // Note: this duplicates a check in `optional` but provides a better error message.
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
         __pred_.__has_value(), "Trying to call begin() on a filter_view that does not have a valid predicate.");
@@ -99,7 +99,7 @@ class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS filter_view : public view_interface<f
     }
   }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr auto end() {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto end() {
     if constexpr (common_range<_View>)
       return __iterator{*this, ranges::end(__base_)};
     else
@@ -148,10 +148,10 @@ class filter_view<_View, _Pred>::__iterator : public __filter_iterator_category<
   _LIBCPP_HIDE_FROM_ABI constexpr __iterator(filter_view& __parent, iterator_t<_View> __current)
       : __current_(std::move(__current)), __parent_(std::addressof(__parent)) {}
 
-  _LIBCPP_HIDE_FROM_ABI constexpr iterator_t<_View> const& base() const& noexcept { return __current_; }
-  _LIBCPP_HIDE_FROM_ABI constexpr iterator_t<_View> base() && { return std::move(__current_); }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr iterator_t<_View> const& base() const& noexcept { return __current_; }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr iterator_t<_View> base() && { return std::move(__current_); }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr range_reference_t<_View> operator*() const { return *__current_; }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr range_reference_t<_View> operator*() const { return *__current_; }
   _LIBCPP_HIDE_FROM_ABI constexpr iterator_t<_View> operator->() const
     requires __has_arrow<iterator_t<_View>> && copyable<iterator_t<_View>>
   {
@@ -194,7 +194,7 @@ class filter_view<_View, _Pred>::__iterator : public __filter_iterator_category<
     return __x.__current_ == __y.__current_;
   }
 
-  _LIBCPP_HIDE_FROM_ABI friend constexpr range_rvalue_reference_t<_View>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI friend constexpr range_rvalue_reference_t<_View>
   iter_move(__iterator const& __it) noexcept(noexcept(ranges::iter_move(__it.__current_))) {
     return ranges::iter_move(__it.__current_);
   }
@@ -218,7 +218,7 @@ class filter_view<_View, _Pred>::__sentinel {
 
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(filter_view& __parent) : __end_(ranges::end(__parent.__base_)) {}
 
-  _LIBCPP_HIDE_FROM_ABI constexpr sentinel_t<_View> base() const { return __end_; }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr sentinel_t<_View> base() const { return __end_; }
 
   _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(__iterator const& __x, __sentinel const& __y) {
     return __x.__current_ == __y.__end_;
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.filter/nodiscard.verify.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.filter/nodiscard.verify.cpp
new file mode 100644
index 0000000000000..0aaf9aec6cc29
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.filter/nodiscard.verify.cpp
@@ -0,0 +1,78 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++20
+
+// Check that functions are marked [[nodiscard]]
+
+#include <ranges>
+#include <utility>
+
+struct NonCommonView : std::ranges::view_base {
+  int* begin() const;
+  const int* end() const;
+
+  int* base();
+
+  int* begin();
+  const int* end();
+};
+static_assert(!std::ranges::common_range<NonCommonView>);
+
+void test() {
+  NonCommonView range;
+  auto pred = [](int) { return true; };
+
+  auto v = std::views::filter(range, pred);
+
+  // [range.filter.view]
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  v.base();
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::move(v).base();
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  v.pred();
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  v.begin();
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  v.end();
+
+  // [range.filter.iterator]
+
+  auto it = v.begin();
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  it.base();
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::move(it).base();
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  iter_move(it);
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  *it;
+
+  // [range.filter.sentinel]
+
+  auto st = v.end();
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  st.base();
+
+  // [range.filter.overview]
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::views::filter(range, pred);
+
+  // expected-warning at +1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::views::filter(pred);
+}

>From c2441689830fcb2588673dedba98da1219a2fb9e Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 26 Dec 2025 06:54:02 -0800
Subject: [PATCH 04/34] Revert "[SLP]Enable float point math ops as copyables
 elements."

This reverts commit 48be4d07c3ca045fe831cbdf216631202c55cd62
to investigate crashes reported in https://github.com/llvm/llvm-project/commit/2568ec6cb29da3db5bd7c848ec53a673c1431aea#commitcomment-173523022.
---
 llvm/include/llvm/IR/Instruction.h            |   6 -
 llvm/include/llvm/IR/IntrinsicInst.h          |   6 -
 llvm/lib/IR/Instruction.cpp                   |   7 -
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  90 +++---------
 .../AArch64/shuffle-vectors-mask-size.ll      |   6 +-
 .../X86/bv-root-part-of-graph.ll              |  11 +-
 .../SLPVectorizer/X86/crash_smallpt.ll        |  22 ++-
 .../entry-no-bundle-but-extra-use-on-vec.ll   |  35 ++---
 .../extractelement-single-use-many-nodes.ll   |   3 +-
 .../X86/multi-node-for-copyable-parent.ll     |  15 +-
 .../X86/multi-node-user-with-copyable-ops.ll  |  19 ++-
 .../non-commutative-op-in-commutative-inst.ll |  12 +-
 .../SLPVectorizer/X86/propagate-mmra.ll       |   4 +-
 .../reused-last-instruction-in-split-node.ll  |  24 ++-
 .../X86/same-operands-but-copyable.ll         |   2 +-
 .../X86/user-with-multi-copyable-ops.ll       |  44 +++---
 .../X86/vect_copyable_in_binops.ll            | 128 ++++++++++++----
 .../SLPVectorizer/alternate-non-profitable.ll |  11 +-
 .../SLPVectorizer/crash_exceed_scheduling.ll  | 138 ++++++------------
 .../extract-many-users-buildvector.ll         |  72 ++++-----
 .../SLPVectorizer/insertelement-postpone.ll   |  40 ++---
 21 files changed, 344 insertions(+), 351 deletions(-)

diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 11385666e7ff8..2eb4fd36c5b7d 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -762,12 +762,6 @@ class Instruction : public User,
   /// applied to any type.
   ///
   LLVM_ABI bool isCommutative() const LLVM_READONLY;
-
-  /// Checks if the operand is commutative. In commutative operations, not all
-  /// operands might commutable, e.g. for fmuladd only 2 first operands are
-  /// commutable.
-  LLVM_ABI bool isCommutableOperand(unsigned Op) const LLVM_READONLY;
-
   static bool isCommutative(unsigned Opcode) {
     switch (Opcode) {
     case Add: case FAdd:
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 0b25baa465a71..0622bfae2c845 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -101,12 +101,6 @@ class IntrinsicInst : public CallInst {
     }
   }
 
-  /// Return true if the operand is commutable.
-  bool isCommutableOperand(unsigned Op) const {
-    constexpr unsigned NumCommutativeOps = 2;
-    return isCommutative() && Op < NumCommutativeOps;
-  }
-
   /// Checks if the intrinsic is an annotation.
   bool isAssumeLikeIntrinsic() const {
     switch (getIntrinsicID()) {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 7682c28e23b33..f3d4d2424fe5b 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1293,13 +1293,6 @@ bool Instruction::isCommutative() const {
   return isCommutative(getOpcode());
 }
 
-bool Instruction::isCommutableOperand(unsigned Op) const {
-  if (auto *II = dyn_cast<IntrinsicInst>(this))
-    return II->isCommutableOperand(Op);
-  // TODO: Should allow icmp/fcmp?
-  return isCommutative(getOpcode());
-}
-
 unsigned Instruction::getNumSuccessors() const {
   switch (getOpcode()) {
 #define HANDLE_TERM_INST(N, OPC, CLASS)                                        \
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c700fef5ecd8f..4af20f0e1838b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -575,27 +575,6 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
   return I->isCommutative();
 }
 
-/// Checks if the operand is commutative. In commutative operations, not all
-/// operands might commutable, e.g. for fmuladd only 2 first operands are
-/// commutable.
-static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
-                                bool IsCopyable = false) {
-  assert(::isCommutative(I, ValWithUses, IsCopyable) &&
-         "The instruction is not commutative.");
-  if (isa<CmpInst>(I))
-    return true;
-  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
-    switch (BO->getOpcode()) {
-    case Instruction::Sub:
-    case Instruction::FSub:
-      return true;
-    default:
-      break;
-    }
-  }
-  return I->isCommutableOperand(Op);
-}
-
 /// This is a helper function to check whether \p I is commutative.
 /// This is a convenience wrapper that calls the two-parameter version of
 /// isCommutative with the same instruction for both parameters. This is
@@ -5349,14 +5328,13 @@ class slpvectorizer::BoUpSLP {
       if (ScheduleCopyableDataMap.empty())
         return false;
       SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
+      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
       ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
       if (Entries.empty())
         return false;
-      unsigned CurNumOps = 0;
       for (const Use &U : User->operands()) {
         if (U.get() != Op)
           continue;
-        ++CurNumOps;
         // Check all tree entries, if they have operands replaced by copyable
         // data.
         for (TreeEntry *TE : Entries) {
@@ -5389,43 +5367,27 @@ class slpvectorizer::BoUpSLP {
           // Same applies even for non-commutative cmps, because we can invert
           // their predicate potentially and, thus, reorder the operands.
           bool IsCommutativeUser =
-              ::isCommutative(User) &&
-              ::isCommutableOperand(User, User, U.getOperandNo());
-          if (!IsCommutativeUser) {
-            Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
-            IsCommutativeUser =
-                ::isCommutative(MainOp, User) &&
-                ::isCommutableOperand(MainOp, User, U.getOperandNo());
-          }
-          // The commutative user with the same operands can be safely
-          // considered as non-commutative, operands reordering does not change
-          // the semantics.
-          assert(
-              (!IsCommutativeUser ||
-               (((::isCommutative(User) &&
-                  ::isCommutableOperand(User, User, 0) &&
-                  ::isCommutableOperand(User, User, 1)) ||
-                 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
-                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
-                                        User, 0) &&
-                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
-                                        User, 1))))) &&
-              "Expected commutative user with 2 first commutable operands");
-          bool IsCommutativeWithSameOps =
-              IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
-          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
-              !isa<CmpInst>(User)) {
+              ::isCommutative(User) ||
+              ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
+          if (!IsCommutativeUser && !isa<CmpInst>(User)) {
+            unsigned &OpCnt =
+                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
             EdgeInfo EI(TE, U.getOperandNo());
-            if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
+            if (!getScheduleCopyableData(EI, Op))
               continue;
-            return false;
+            // Found copyable operand - continue.
+            OpCnt += Inc;
+            continue;
           }
           PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
               .first->getSecond() += Inc;
         }
       }
       if (PotentiallyReorderedEntriesCount.empty())
-        return true;
+        return all_of(OrderedEntriesCount,
+                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
+                        return P.second == NumOps;
+                      });
       // Check the commutative/cmp entries.
       for (auto &P : PotentiallyReorderedEntriesCount) {
         SmallPtrSet<Value *, 4> ParentsUniqueUsers;
@@ -5471,6 +5433,10 @@ class slpvectorizer::BoUpSLP {
       return all_of(PotentiallyReorderedEntriesCount,
                     [&](const std::pair<const TreeEntry *, unsigned> &P) {
                       return P.second == NumOps - 1;
+                    }) &&
+             all_of(OrderedEntriesCount,
+                    [&](const std::pair<const TreeEntry *, unsigned> &P) {
+                      return P.second == NumOps;
                     });
     }
 
@@ -5681,22 +5647,17 @@ class slpvectorizer::BoUpSLP {
                 auto It = OperandsUses.find(I);
                 assert(It != OperandsUses.end() && "Operand not found");
                 if (It->second > 0) {
+                  --It->getSecond();
+                  assert(TotalOpCount > 0 && "No more operands to decrement");
+                  --TotalOpCount;
                   if (ScheduleData *OpSD = getScheduleData(I)) {
                     if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
                       return;
-                    --It->getSecond();
-                    assert(TotalOpCount > 0 && "No more operands to decrement");
-                    --TotalOpCount;
                     DecrUnsched(OpSD, /*IsControl=*/false);
-                  } else {
-                    --It->getSecond();
-                    assert(TotalOpCount > 0 && "No more operands to decrement");
-                    --TotalOpCount;
                   }
                 }
               };
 
-          SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
           for (ScheduleBundle *Bundle : Bundles) {
             if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
               break;
@@ -5704,6 +5665,7 @@ class slpvectorizer::BoUpSLP {
             // Need to search for the lane since the tree entry can be
             // reordered.
             auto *It = find(Bundle->getTreeEntry()->Scalars, In);
+            SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
             bool IsNonSchedulableWithParentPhiNode =
                 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                 Bundle->getTreeEntry()->UserTreeIndex &&
@@ -10913,9 +10875,7 @@ class InstructionsCompatibilityAnalysis {
            Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
            Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
            Opcode == Instruction::And || Opcode == Instruction::Or ||
-           Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
-           Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
-           Opcode == Instruction::FDiv;
+           Opcode == Instruction::Xor;
   }
 
   /// Identifies the best candidate value, which represents main opcode
@@ -11256,10 +11216,6 @@ class InstructionsCompatibilityAnalysis {
       case Instruction::And:
       case Instruction::Or:
       case Instruction::Xor:
-      case Instruction::FAdd:
-      case Instruction::FMul:
-      case Instruction::FSub:
-      case Instruction::FDiv:
         VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
         break;
       default:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
index 961662c664a31..0783a28f56d85 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
@@ -11,10 +11,10 @@ define void @p(double %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> <i32 1, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> <double 1.000000e+00, double 1.000000e+00, double poison, double poison>, <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x double> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
index 1abc16da77c8e..0cc4d3db5c537 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
@@ -4,16 +4,15 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP9:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[FMUL:%.*]] = sitofp i32 0 to float
-; CHECK-NEXT:    [[SITOFP:%.*]] = sitofp i32 0 to float
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[SITOFP]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 0, i32 0, i32 poison, i32 7>
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul float 0.000000e+00, 0.000000e+00
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FMUL]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
index c1cc3f2dfc9e5..d13a8578d1e00 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -7,30 +7,36 @@
 define void @main(i1 %arg) {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
 ; CHECK:       cond.true:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.end:
 ; CHECK-NEXT:    br label [[INVOKE_CONT:%.*]]
 ; CHECK:       invoke.cont:
-; CHECK-NEXT:    br i1 [[ARG]], label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
+; CHECK-NEXT:    br i1 %arg, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
 ; CHECK:       arrayctor.cont:
 ; CHECK-NEXT:    [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER:%.*]]
 ; CHECK:       for.cond36.preheader:
-; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
 ; CHECK:       cond.false51.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.true48.us:
-; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
 ; CHECK:       cond.false66.us:
-; CHECK-NEXT:    store <2 x double> <double 0x404900049667B5F2, double 0x404E0515D587DA7B>, ptr undef, align 8
-; CHECK-NEXT:    store <2 x double> <double 2.000000e-07, double 0x3F91A436DC4B6CE6>, ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
+; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> <double poison, double 0xBFA5CC2D1960285F>, double [[ADD_I276_US]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> <double 0.000000e+00, double 1.000000e-01>, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.400000e+02)
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 5.000000e+01, double 5.200000e+01>
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr undef, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 2.000000e-01, double 3.000000e-01>, [[TMP1]]
+; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
 ; CHECK-NEXT:    ret void
 ; CHECK:       cond.true63.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       for.body42.lr.ph.us:
-; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
 ; CHECK:       _Z5clampd.exit.1:
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER]]
 ;
@@ -90,7 +96,7 @@ _Z5clampd.exit.1:
 define void @test(i1 %arg) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
 ; CHECK:       if.then38:
 ; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    store <2 x double> <double 0x3FFA356C1D8A7F76, double 0x3FFDC4F38B38BEF4>, ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index ca65ff88a4b81..6d713e83bbf4e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -9,38 +9,33 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP20]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT:    [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x float> [ [[TMP10]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP21]], zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP28]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x float> <float 1.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP32:%.*]] = fmul <4 x float> <float -0.000000e+00, float -0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP31]]
-; CHECK-NEXT:    [[TMP26:%.*]] = fadd <4 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP32]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer
 ; CHECK-NEXT:    [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]]
 ; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[CALL25]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 91ec61b275205..6942df532ae29 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -25,7 +25,8 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]]
 ; CHECK-NEXT:    [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[I82]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer
 ; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
index fd7f0c61b6737..a07e617384e09 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
@@ -6,17 +6,14 @@ define i1 @test(double %circ_radius, ptr %x) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
+; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP0]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double poison, double poison>, double [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x double> <double -0.000000e+00, double 0.000000e+00, double 1.000000e+00, double -0.000000e+00>, [[TMP13]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[ADD20]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> <double poison, double poison, double 0.000000e+00, double poison>, <4 x i32> <i32 1, i32 2, i32 6, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP8]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp olt <4 x double> [[TMP9]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
index a9baedef3e509..eb3b183fd49eb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
@@ -6,18 +6,17 @@ define i1 @test(double %circ_radius, ptr %x, double %0) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]], double [[TMP0:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 1
+; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], <double -0.000000e+00, double 0.000000e+00, double -0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD20]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double 0.000000e+00, double 1.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP9]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP7]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP12]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = fcmp olt <4 x double> [[TMP13]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
index b71dbc49e7478..8c684325f8c68 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
@@ -8,11 +8,13 @@ define void @test(ptr %quat, float %call13) {
 ; CHECK-SAME: ptr [[QUAT:%.*]], float [[CALL13:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CALL121:%.*]] = load volatile float, ptr null, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[CALL121]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP2]], <float 0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fmuladd.f32(float [[CALL13]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[CALL121]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> zeroinitializer, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[QUAT]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll b/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
index a84c6ae6b0980..ba52ef4c462a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
@@ -5,9 +5,9 @@ define void @foo() {
 ; CHECK-LABEL: define void @foo() {
 ; CHECK-NEXT:  [[_PREHEADER16_PREHEADER:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4, !mmra [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 1.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP4]], <2 x float> zeroinitializer
 ; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr null, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
index 6dc9806da0aa9..f101991648276 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
@@ -4,7 +4,9 @@
 define float @test() {
 ; CHECK-LABEL: define float @test() {
 ; CHECK-NEXT:  [[LABEL:.*]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float 0.000000e+00, i32 0
+; CHECK-NEXT:    [[SUB_I102_I:%.*]] = fsub float 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison>, float [[SUB_I102_I]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -12,12 +14,26 @@ define float @test() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <8 x float> [[TMP7]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> <float poison, float 1.000000e+00>, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> zeroinitializer, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <12 x float> [[TMP16]], <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <8 x float> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> [[TMP20]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP17]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <12 x float> [[TMP22]], <12 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP21]], <8 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <20 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <20 x float> [[TMP23]], <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
 ; CHECK-NEXT:    br label %[[REGION_30:.*]]
 ; CHECK:       [[REGION_30]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP10]], %[[LABEL]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP25]], %[[LABEL]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <20 x float> [[TMP26]], i32 7
 ; CHECK-NEXT:    ret float [[TMP27]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
index f1031937180a3..3645ad89af624 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
@@ -7,10 +7,10 @@ define void @test(ptr %0, ptr %1, float %.sroa.3232.0.copyload) {
 ; CHECK-NEXT:  [[BB:.*:]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 12
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP3]], <float 0.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float [[DOTSROA_3232_0_COPYLOAD]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP5]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP5]], [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
index c58c63e51737c..7b298723d93b5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
@@ -11,23 +11,30 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[ARRAYIDX_I1464:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX_I1464]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[THIS]], align 8
+; CHECK-NEXT:    [[DIV251:%.*]] = fmul double [[TMP1]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL257:%.*]] = fmul double [[TMP4]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL305:%.*]] = fmul double [[TMP4]], 0.000000e+00
+; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
+; CHECK-NEXT:    [[NEG356:%.*]] = fmul double [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG356]], double 0.000000e+00, double 0.000000e+00)
 ; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[THIS]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = fneg double [[TMP3]]
 ; CHECK-NEXT:    [[NEG380:%.*]] = fmul double [[TMP1]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double [[MUL257]])
 ; CHECK-NEXT:    [[FNEG381:%.*]] = fneg double [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> poison, double [[MUL257]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = fneg <2 x double> [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[DIV251]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FNEG381]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x double> [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[NEG417:%.*]] = fneg double [[MUL257]]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG417]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT:    [[FNEG418:%.*]] = fneg double [[TMP16]]
+; CHECK-NEXT:    [[MUL419:%.*]] = fmul double [[DIV251]], [[FNEG418]]
 ; CHECK-NEXT:    [[NEG436:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> <double 1.000000e+00, double poison>, double [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP14]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
-; CHECK-NEXT:    [[TMP15:%.*]] = fneg <2 x double> [[TMP17]]
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT:    [[FNEG437:%.*]] = fneg double [[TMP17]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fneg double [[TMP4]]
 ; CHECK-NEXT:    [[NEG455:%.*]] = fmul double [[TMP1]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG455]], double 0.000000e+00, double [[MUL305]])
@@ -35,18 +42,19 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[FNEG474:%.*]] = fneg double [[TMP20]]
 ; CHECK-NEXT:    [[NEG492:%.*]] = fneg double [[MUL305]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG492]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> <double 1.000000e+00, double 0.000000e+00>, [[TMP13]]
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP23]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x double> poison, double [[DIV251]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x double> poison, double [[FNEG437]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x double> [[TMP24]], double [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[FNEG474]], i32 2
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP21]], i32 3
+; CHECK-NEXT:    [[TMP28:%.*]] = fmul <4 x double> [[TMP23]], [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x double> poison, double [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> [[TMP29]], double [[FNEG381]], i32 1
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x double> [[TMP25]], double [[TMP10]], i32 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP28]], double [[TMP19]], i32 5
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[FNEG474]], i32 6
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x double> [[TMP33]], double [[TMP21]], i32 7
-; CHECK-NEXT:    [[TMP34:%.*]] = fmul <8 x double> [[TMP31]], [[TMP22]]
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <8 x double> [[TMP29]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[MUL419]], i32 3
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <4 x double> [[TMP28]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[TMP35:%.*]] = fptrunc <8 x double> [[TMP34]] to <8 x float>
 ; CHECK-NEXT:    store <8 x float> [[TMP35]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index d10d26671e76b..2a0e7889f0f34 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
 
 define void @add0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @add0(
@@ -336,12 +336,32 @@ entry:
 }
 
 define void @add1f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1f(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @add1f(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @add1f(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -367,9 +387,18 @@ entry:
 define void @sub0f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -536,9 +565,18 @@ entry:
 define void @mulf(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -593,12 +631,32 @@ entry:
 }
 
 define void @add1fn(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1fn(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @add1fn(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @add1fn(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -624,9 +682,18 @@ entry:
 define void @sub0fn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -713,9 +780,18 @@ entry:
 define void @mulfn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulfn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index b23da5fa263f6..125c2dce32663 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -52,12 +52,11 @@ define <2 x float> @replace_through_casts_and_binop(i16 %inp) {
 ; CHECK-SAME: i16 [[INP:%.*]]) {
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[INP]], -10
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i16 [[INP]], 5
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[MUL]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[R:%.*]] = fadd <2 x float> [[TMP5]], <float 2.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i16 [[MUL]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 2.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = sitofp i16 [[ADD]] to float
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i64 1
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %add = add nsw i16 %inp, -10
diff --git a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
index c79969de6ac41..793d089404d1e 100644
--- a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
@@ -1,98 +1,52 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
 
 define void @exceed(double %0, double %1) {
-; X86-LABEL: @exceed(
-; X86-NEXT:  entry:
-; X86-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; X86-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; X86-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; X86-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; X86-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; X86-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; X86-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
-; X86-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
-; X86-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; X86-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
-; X86-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; X86-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; X86-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
-; X86-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
-; X86-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
-; X86-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
-; X86-NEXT:    switch i32 undef, label [[BB1:%.*]] [
-; X86-NEXT:      i32 0, label [[BB2:%.*]]
-; X86-NEXT:    ]
-; X86:       bb1:
-; X86-NEXT:    br label [[LABEL:%.*]]
-; X86:       bb2:
-; X86-NEXT:    br label [[LABEL]]
-; X86:       label:
-; X86-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
-; X86-NEXT:    ret void
-;
-; AARCH64-LABEL: @exceed(
-; AARCH64-NEXT:  entry:
-; AARCH64-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; AARCH64-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; AARCH64-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; AARCH64-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; AARCH64-NEXT:    [[IX2:%.*]] = fmul double [[TMP7]], [[TMP7]]
-; AARCH64-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> <double poison, double 1.000000e+00>, <2 x i32> <i32 0, i32 3>
-; AARCH64-NEXT:    [[TMP11:%.*]] = fdiv fast <2 x double> [[TMP9]], [[TMP10]]
-; AARCH64-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; AARCH64-NEXT:    [[IX:%.*]] = fmul double [[TMP12]], undef
-; AARCH64-NEXT:    [[IX1:%.*]] = fmul double [[TMP12]], undef
-; AARCH64-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
-; AARCH64-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], [[TMP8]]
-; AARCH64-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP11]], undef
-; AARCH64-NEXT:    switch i32 undef, label [[BB1:%.*]] [
-; AARCH64-NEXT:      i32 0, label [[BB2:%.*]]
-; AARCH64-NEXT:    ]
-; AARCH64:       bb1:
-; AARCH64-NEXT:    br label [[LABEL:%.*]]
-; AARCH64:       bb2:
-; AARCH64-NEXT:    br label [[LABEL]]
-; AARCH64:       label:
-; AARCH64-NEXT:    [[TMP16:%.*]] = phi <2 x double> [ [[TMP14]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
-; AARCH64-NEXT:    ret void
+; CHECK-LABEL: @exceed(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
+; CHECK-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
+; CHECK-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
+; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
+; CHECK-NEXT:    switch i32 undef, label [[BB1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[LABEL:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[LABEL]]
+; CHECK:       label:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
+; CHECK-NEXT:    ret void
 ;
 entry:
   %i10 = fdiv fast double %0, %1
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 439943102b58a..32e59697486a7 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -7,52 +7,56 @@ define i1 @test(float %0, double %1) {
 ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; X86-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
-; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
-; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
-; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
-; X86-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
+; X86-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; X86-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
 ; X86-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP6]], [[TMP12]]
+; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
 ; X86-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; X86-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP14]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP15]], [[TMP10]]
-; X86-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP15]], [[TMP10]]
-; X86-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; X86-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
-; X86-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
-; X86-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
-; X86-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
-; X86-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
-; X86-NEXT:    ret i1 [[TMP23]]
+; X86-NEXT:    [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP16]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
+; X86-NEXT:    [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]]
+; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]]
+; X86-NEXT:    [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float>
+; X86-NEXT:    [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer
+; X86-NEXT:    [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer
+; X86-NEXT:    [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]]
+; X86-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]])
+; X86-NEXT:    ret i1 [[TMP25]]
 ;
 ; AARCH64-LABEL: define i1 @test
 ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
-; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
-; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
-; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
-; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
+; AARCH64-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; AARCH64-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
 ; AARCH64-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
-; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP6]], [[TMP13]]
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]]
 ; AARCH64-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP16:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP15]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP17:%.*]] = fsub <8 x double> [[TMP16]], [[TMP10]]
-; AARCH64-NEXT:    [[TMP18:%.*]] = fmul <8 x double> [[TMP16]], [[TMP10]]
-; AARCH64-NEXT:    [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; AARCH64-NEXT:    [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float>
-; AARCH64-NEXT:    [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer
-; AARCH64-NEXT:    [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer
-; AARCH64-NEXT:    [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]]
-; AARCH64-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]])
-; AARCH64-NEXT:    ret i1 [[TMP24]]
+; AARCH64-NEXT:    [[TMP17:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP17]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
+; AARCH64-NEXT:    [[TMP19:%.*]] = fsub <8 x double> [[TMP16]], [[TMP18]]
+; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x double> [[TMP16]], [[TMP18]]
+; AARCH64-NEXT:    [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP22:%.*]] = fptrunc <8 x double> [[TMP21]] to <8 x float>
+; AARCH64-NEXT:    [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], zeroinitializer
+; AARCH64-NEXT:    [[TMP24:%.*]] = fcmp oeq <8 x float> [[TMP23]], zeroinitializer
+; AARCH64-NEXT:    [[TMP25:%.*]] = freeze <8 x i1> [[TMP24]]
+; AARCH64-NEXT:    [[TMP26:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP25]])
+; AARCH64-NEXT:    ret i1 [[TMP26]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index 09e3ef41b3dbe..eefc99feebb95 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -6,34 +6,34 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778)
 ; X86-LABEL: @test(
 ; X86-NEXT:  entry:
 ; X86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
-; X86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
-; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
+; X86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
+; X86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
+; X86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
+; X86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
 ; X86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
-; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
-; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
-; X86-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
-; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; X86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
 ; X86-NEXT:    [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; X86-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AARCH86-LABEL: @test(
 ; AARCH86-NEXT:  entry:
 ; AARCH86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; AARCH86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
-; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
-; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
+; AARCH86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
+; AARCH86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
+; AARCH86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
+; AARCH86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
 ; AARCH86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
-; AARCH86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
-; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
-; AARCH86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
-; AARCH86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
-; AARCH86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; AARCH86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; AARCH86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; AARCH86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
 ; AARCH86-NEXT:    [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; AARCH86-NEXT:    ret <4 x double> [[I1994]]
 ;

>From a08cc6e0d5e3fa653649a7826f1ffafc2b3ea2dd Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 26 Dec 2025 06:54:52 -0800
Subject: [PATCH 05/34] Revert "[SLP]Recalculate dependencies for all cleared
 entries"

This reverts commit 2568ec6cb29da3db5bd7c848ec53a673c1431aea to
investigate crashes reported in https://github.com/llvm/llvm-project/commit/2568ec6cb29da3db5bd7c848ec53a673c1431aea#commitcomment-173523022.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  4 +-
 .../X86/non-schedulable-recalculate-deps.ll   | 61 -------------------
 2 files changed, 3 insertions(+), 62 deletions(-)
 delete mode 100644 llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-recalculate-deps.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4af20f0e1838b..b78cfca8436d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21269,7 +21269,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
           if (ScheduleData *OpSD = getScheduleData(Op);
               OpSD && OpSD->hasValidDependencies()) {
             OpSD->clearDirectDependencies();
-            ControlDependentMembers.push_back(OpSD);
+            if (RegionHasStackSave ||
+                !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
+              ControlDependentMembers.push_back(OpSD);
           }
         }
       }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-recalculate-deps.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-recalculate-deps.ll
deleted file mode 100644
index 566cf106ea47f..0000000000000
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-recalculate-deps.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-
-define void @test(i32 %g, i1 %tobool1.not) {
-; CHECK-LABEL: define void @test(
-; CHECK-SAME: i32 [[G:%.*]], i1 [[TOBOOL1_NOT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
-; CHECK:       [[WHILE_BODY]]:
-; CHECK-NEXT:    [[G_ADDR_0334:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[REASS_SUB35:%.*]], %[[J:.*]] ]
-; CHECK-NEXT:    [[I_ADDR_032:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB14:%.*]], %[[J]] ]
-; CHECK-NEXT:    br i1 [[TOBOOL1_NOT]], label %[[IF_END:.*]], label %[[J]]
-; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[SUB3:%.*]] = add i32 [[G]], 1
-; CHECK-NEXT:    [[REM6:%.*]] = srem i32 1, [[G]]
-; CHECK-NEXT:    [[SUB8:%.*]] = add i32 [[REM6]], 1
-; CHECK-NEXT:    [[REM10:%.*]] = srem i32 1, [[G]]
-; CHECK-NEXT:    [[SUB11:%.*]] = add i32 [[REM10]], 1
-; CHECK-NEXT:    [[SUB12:%.*]] = add i32 [[SUB11]], [[G]]
-; CHECK-NEXT:    br label %[[J]]
-; CHECK:       [[J]]:
-; CHECK-NEXT:    [[I_ADDR_1:%.*]] = phi i32 [ [[I_ADDR_032]], %[[WHILE_BODY]] ], [ [[SUB8]], %[[IF_END]] ]
-; CHECK-NEXT:    [[G_ADDR_1:%.*]] = phi i32 [ [[G_ADDR_0334]], %[[WHILE_BODY]] ], [ [[SUB12]], %[[IF_END]] ]
-; CHECK-NEXT:    [[F_1:%.*]] = phi i32 [ [[G]], %[[WHILE_BODY]] ], [ [[SUB3]], %[[IF_END]] ]
-; CHECK-NEXT:    [[I_ADDR_1_FR11:%.*]] = freeze i32 [[I_ADDR_1]]
-; CHECK-NEXT:    [[DIV:%.*]] = select i1 [[TOBOOL1_NOT]], i32 [[I_ADDR_1_FR11]], i32 0
-; CHECK-NEXT:    [[SUB14]] = or i32 [[DIV]], [[G]]
-; CHECK-NEXT:    [[F_1_FR10:%.*]] = freeze i32 [[F_1]]
-; CHECK-NEXT:    [[DIV16:%.*]] = select i1 [[TOBOOL1_NOT]], i32 [[F_1_FR10]], i32 0
-; CHECK-NEXT:    [[REASS_SUB35]] = or i32 [[DIV16]], [[G_ADDR_1]]
-; CHECK-NEXT:    br label %[[WHILE_BODY]]
-;
-entry:
-  br label %while.body
-
-while.body:
-  %g.addr.0334 = phi i32 [ 0, %entry ], [ %reass.sub35, %j ]
-  %i.addr.032 = phi i32 [ 0, %entry ], [ %sub14, %j ]
-  br i1 %tobool1.not, label %if.end, label %j
-
-if.end:
-  %sub3 = add i32 %g, 1
-  %rem6 = srem i32 1, %g
-  %sub8 = add i32 %rem6, 1
-  %rem10 = srem i32 1, %g
-  %sub11 = add i32 %rem10, 1
-  %sub12 = add i32 %sub11, %g
-  br label %j
-
-j:
-  %i.addr.1 = phi i32 [ %i.addr.032, %while.body ], [ %sub8, %if.end ]
-  %g.addr.1 = phi i32 [ %g.addr.0334, %while.body ], [ %sub12, %if.end ]
-  %f.1 = phi i32 [ %g, %while.body ], [ %sub3, %if.end ]
-  %i.addr.1.fr11 = freeze i32 %i.addr.1
-  %div = select i1 %tobool1.not, i32 %i.addr.1.fr11, i32 0
-  %sub14 = or i32 %div, %g
-  %f.1.fr10 = freeze i32 %f.1
-  %div16 = select i1 %tobool1.not, i32 %f.1.fr10, i32 0
-  %reass.sub35 = or i32 %div16, %g.addr.1
-  br label %while.body
-}

>From 571819cb7931f071c09b4129d8ec130ef63fe8dd Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 24 Dec 2025 09:45:05 -0800
Subject: [PATCH 06/34] [SLP]Recalculate dependencies for all cleared entries

Need to recalculate the dependencies for all cleared items to avoid
a crash, if the entry is used in other vector nodes

Fixes #173469
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 14 +--
 .../X86/non-schedulable-recalculate-deps.ll   | 61 +++++++++++++
 .../X86/non-schedulable-with-copyable-op.ll   | 89 +++++++++++++++++++
 3 files changed, 157 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-recalculate-deps.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-with-copyable-op.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b78cfca8436d8..6fd9759521543 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5905,7 +5905,8 @@ class slpvectorizer::BoUpSLP {
     /// bundles which depend on the original bundle.
     void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                                BoUpSLP *SLP,
-                               ArrayRef<ScheduleData *> ControlDeps = {});
+                               ArrayRef<ScheduleData *> ControlDeps = {},
+                               bool NonSchedulable = false);
 
     /// Sets all instruction in the scheduling region to un-scheduled.
     void resetSchedule();
@@ -21269,9 +21270,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
           if (ScheduleData *OpSD = getScheduleData(Op);
               OpSD && OpSD->hasValidDependencies()) {
             OpSD->clearDirectDependencies();
-            if (RegionHasStackSave ||
-                !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
-              ControlDependentMembers.push_back(OpSD);
+            ControlDependentMembers.push_back(OpSD);
           }
         }
       }
@@ -21279,7 +21278,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
     if (!ControlDependentMembers.empty()) {
       ScheduleBundle Invalid = ScheduleBundle::invalid();
       calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
-                            ControlDependentMembers);
+                            ControlDependentMembers, /*NonSchedulable=*/true);
     }
     return nullptr;
   }
@@ -21663,7 +21662,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
 
 void BoUpSLP::BlockScheduling::calculateDependencies(
     ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
-    ArrayRef<ScheduleData *> ControlDeps) {
+    ArrayRef<ScheduleData *> ControlDeps, bool NonSchedulable) {
   SmallVector<ScheduleEntity *> WorkList;
   auto ProcessNode = [&](ScheduleEntity *SE) {
     if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
@@ -21748,7 +21747,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
         // The operand is a copyable element - skip.
         unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
         ++NumOps;
-        if (areAllOperandsReplacedByCopyableData(
+        if (!NonSchedulable &&
+            areAllOperandsReplacedByCopyableData(
                 cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
           continue;
         BundleMember->incDependencies();
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-recalculate-deps.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-recalculate-deps.ll
new file mode 100644
index 0000000000000..566cf106ea47f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-recalculate-deps.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(i32 %g, i1 %tobool1.not) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[G:%.*]], i1 [[TOBOOL1_NOT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
+; CHECK:       [[WHILE_BODY]]:
+; CHECK-NEXT:    [[G_ADDR_0334:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[REASS_SUB35:%.*]], %[[J:.*]] ]
+; CHECK-NEXT:    [[I_ADDR_032:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB14:%.*]], %[[J]] ]
+; CHECK-NEXT:    br i1 [[TOBOOL1_NOT]], label %[[IF_END:.*]], label %[[J]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[SUB3:%.*]] = add i32 [[G]], 1
+; CHECK-NEXT:    [[REM6:%.*]] = srem i32 1, [[G]]
+; CHECK-NEXT:    [[SUB8:%.*]] = add i32 [[REM6]], 1
+; CHECK-NEXT:    [[REM10:%.*]] = srem i32 1, [[G]]
+; CHECK-NEXT:    [[SUB11:%.*]] = add i32 [[REM10]], 1
+; CHECK-NEXT:    [[SUB12:%.*]] = add i32 [[SUB11]], [[G]]
+; CHECK-NEXT:    br label %[[J]]
+; CHECK:       [[J]]:
+; CHECK-NEXT:    [[I_ADDR_1:%.*]] = phi i32 [ [[I_ADDR_032]], %[[WHILE_BODY]] ], [ [[SUB8]], %[[IF_END]] ]
+; CHECK-NEXT:    [[G_ADDR_1:%.*]] = phi i32 [ [[G_ADDR_0334]], %[[WHILE_BODY]] ], [ [[SUB12]], %[[IF_END]] ]
+; CHECK-NEXT:    [[F_1:%.*]] = phi i32 [ [[G]], %[[WHILE_BODY]] ], [ [[SUB3]], %[[IF_END]] ]
+; CHECK-NEXT:    [[I_ADDR_1_FR11:%.*]] = freeze i32 [[I_ADDR_1]]
+; CHECK-NEXT:    [[DIV:%.*]] = select i1 [[TOBOOL1_NOT]], i32 [[I_ADDR_1_FR11]], i32 0
+; CHECK-NEXT:    [[SUB14]] = or i32 [[DIV]], [[G]]
+; CHECK-NEXT:    [[F_1_FR10:%.*]] = freeze i32 [[F_1]]
+; CHECK-NEXT:    [[DIV16:%.*]] = select i1 [[TOBOOL1_NOT]], i32 [[F_1_FR10]], i32 0
+; CHECK-NEXT:    [[REASS_SUB35]] = or i32 [[DIV16]], [[G_ADDR_1]]
+; CHECK-NEXT:    br label %[[WHILE_BODY]]
+;
+entry:
+  br label %while.body
+
+while.body:
+  %g.addr.0334 = phi i32 [ 0, %entry ], [ %reass.sub35, %j ]
+  %i.addr.032 = phi i32 [ 0, %entry ], [ %sub14, %j ]
+  br i1 %tobool1.not, label %if.end, label %j
+
+if.end:
+  %sub3 = add i32 %g, 1
+  %rem6 = srem i32 1, %g
+  %sub8 = add i32 %rem6, 1
+  %rem10 = srem i32 1, %g
+  %sub11 = add i32 %rem10, 1
+  %sub12 = add i32 %sub11, %g
+  br label %j
+
+j:
+  %i.addr.1 = phi i32 [ %i.addr.032, %while.body ], [ %sub8, %if.end ]
+  %g.addr.1 = phi i32 [ %g.addr.0334, %while.body ], [ %sub12, %if.end ]
+  %f.1 = phi i32 [ %g, %while.body ], [ %sub3, %if.end ]
+  %i.addr.1.fr11 = freeze i32 %i.addr.1
+  %div = select i1 %tobool1.not, i32 %i.addr.1.fr11, i32 0
+  %sub14 = or i32 %div, %g
+  %f.1.fr10 = freeze i32 %f.1
+  %div16 = select i1 %tobool1.not, i32 %f.1.fr10, i32 0
+  %reass.sub35 = or i32 %div16, %g.addr.1
+  br label %while.body
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-with-copyable-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-with-copyable-op.ll
new file mode 100644
index 0000000000000..688767713b571
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-with-copyable-op.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(i32 %xor4.i, i32 %xor18.i, i1 %tobool.not.i) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i32 [[XOR4_I:%.*]], i32 [[XOR18_I:%.*]], i1 [[TOBOOL_NOT_I:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[BB:.*]]
+; CHECK:       [[BB]]:
+; CHECK-NEXT:    [[XOR375678_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[XOR37_I:%.*]], %[[BB]] ]
+; CHECK-NEXT:    [[XOR385777_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[XOR35_I:%.*]], %[[BB]] ]
+; CHECK-NEXT:    [[XOR445876_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[AND43_I:%.*]], %[[BB]] ]
+; CHECK-NEXT:    [[XOR505975_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[XOR48_I:%.*]], %[[BB]] ]
+; CHECK-NEXT:    [[XOR316272_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[XOR31_I:%.*]], %[[BB]] ]
+; CHECK-NEXT:    [[XOR536470_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[XOR53_I:%.*]], %[[BB]] ]
+; CHECK-NEXT:    [[XOR30_I:%.*]] = tail call i32 @llvm.fshl.i32(i32 [[XOR4_I]], i32 1, i32 1)
+; CHECK-NEXT:    [[XOR31_I]] = xor i32 [[XOR30_I]], [[XOR4_I]]
+; CHECK-NEXT:    [[XOR11_I:%.*]] = tail call i32 @llvm.fshl.i32(i32 [[XOR4_I]], i32 [[XOR375678_I]], i32 1)
+; CHECK-NEXT:    [[XOR7_I:%.*]] = tail call i32 @llvm.fshl.i32(i32 [[XOR4_I]], i32 [[XOR316272_I]], i32 1)
+; CHECK-NEXT:    [[AND22_I:%.*]] = and i32 [[XOR11_I]], [[XOR7_I]]
+; CHECK-NEXT:    [[XOR23_I:%.*]] = xor i32 [[XOR18_I]], [[AND22_I]]
+; CHECK-NEXT:    [[XOR32_I:%.*]] = xor i32 [[XOR23_I]], 1
+; CHECK-NEXT:    [[XOR35_I]] = tail call i32 @llvm.fshl.i32(i32 [[XOR32_I]], i32 [[XOR18_I]], i32 1)
+; CHECK-NEXT:    [[AND36_I:%.*]] = and i32 [[XOR35_I]], 1
+; CHECK-NEXT:    [[XOR37_I]] = xor i32 [[AND36_I]], 1
+; CHECK-NEXT:    [[XOR14_I:%.*]] = tail call i32 @llvm.fshl.i32(i32 1, i32 [[XOR536470_I]], i32 1)
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[XOR14_I]], [[XOR11_I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i32 [[XOR4_I]], [[AND_I]]
+; CHECK-NEXT:    [[XOR39_I:%.*]] = xor i32 [[TMP0]], 1
+; CHECK-NEXT:    [[XOR42_I:%.*]] = tail call i32 @llvm.fshl.i32(i32 [[XOR39_I]], i32 [[XOR23_I]], i32 1)
+; CHECK-NEXT:    [[AND43_I]] = and i32 [[XOR42_I]], [[XOR4_I]]
+; CHECK-NEXT:    [[XOR15_I:%.*]] = xor i32 [[XOR536470_I]], [[XOR4_I]]
+; CHECK-NEXT:    [[XOR18_I3:%.*]] = tail call i32 @llvm.fshl.i32(i32 1, i32 [[XOR15_I]], i32 1)
+; CHECK-NEXT:    [[AND25_I:%.*]] = and i32 [[XOR7_I]], [[XOR18_I3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[AND25_I]], [[XOR4_I]]
+; CHECK-NEXT:    [[XOR45_I:%.*]] = xor i32 [[TMP1]], 1
+; CHECK-NEXT:    [[XOR48_I]] = tail call i32 @llvm.fshl.i32(i32 [[XOR45_I]], i32 1, i32 1)
+; CHECK-NEXT:    [[XOR53_I]] = xor i32 [[XOR4_I]], 1
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT_I]], label %[[EXIT:.*]], label %[[BB]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i32 [[XOR385777_I]], [[XOR445876_I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], [[XOR505975_I]]
+; CHECK-NEXT:    [[XOR2_I:%.*]] = xor i32 [[TMP3]], [[XOR375678_I]]
+; CHECK-NEXT:    ret i32 [[XOR2_I]]
+;
+entry:
+  br label %bb
+
+bb:
+  %xor375678.i = phi i32 [ 0, %entry ], [ %xor37.i, %bb ]
+  %xor385777.i = phi i32 [ 0, %entry ], [ %xor35.i, %bb ]
+  %xor445876.i = phi i32 [ 0, %entry ], [ %and43.i, %bb ]
+  %xor505975.i = phi i32 [ 0, %entry ], [ %xor48.i, %bb ]
+  %xor316272.i = phi i32 [ 0, %entry ], [ %xor31.i, %bb ]
+  %xor536470.i = phi i32 [ 0, %entry ], [ %xor53.i, %bb ]
+  %xor30.i = tail call i32 @llvm.fshl.i32(i32 %xor4.i, i32 1, i32 1)
+  %xor31.i = xor i32 %xor30.i, %xor4.i
+  %xor11.i = tail call i32 @llvm.fshl.i32(i32 %xor4.i, i32 %xor375678.i, i32 1)
+  %xor7.i = tail call i32 @llvm.fshl.i32(i32 %xor4.i, i32 %xor316272.i, i32 1)
+  %and22.i = and i32 %xor11.i, %xor7.i
+  %xor23.i = xor i32 %xor18.i, %and22.i
+  %xor32.i = xor i32 %xor23.i, 1
+  %xor35.i = tail call i32 @llvm.fshl.i32(i32 %xor32.i, i32 %xor18.i, i32 1)
+  %and36.i = and i32 %xor35.i, 1
+  %xor37.i = xor i32 %and36.i, 1
+  %xor14.i = tail call i32 @llvm.fshl.i32(i32 1, i32 %xor536470.i, i32 1)
+  %and.i = and i32 %xor14.i, %xor11.i
+  %0 = xor i32 %xor4.i, %and.i
+  %xor39.i = xor i32 %0, 1
+  %xor42.i = tail call i32 @llvm.fshl.i32(i32 %xor39.i, i32 %xor23.i, i32 1)
+  %and43.i = and i32 %xor42.i, %xor4.i
+  %xor15.i = xor i32 %xor536470.i, %xor4.i
+  %xor18.i3 = tail call i32 @llvm.fshl.i32(i32 1, i32 %xor15.i, i32 1)
+  %and25.i = and i32 %xor7.i, %xor18.i3
+  %1 = xor i32 %and25.i, %xor4.i
+  %xor45.i = xor i32 %1, 1
+  %xor48.i = tail call i32 @llvm.fshl.i32(i32 %xor45.i, i32 1, i32 1)
+  %xor53.i = xor i32 %xor4.i, 1
+  br i1 %tobool.not.i, label %exit, label %bb
+
+exit:
+  %2 = xor i32 %xor385777.i, %xor445876.i
+  %3 = xor i32 %2, %xor505975.i
+  %xor2.i = xor i32 %3, %xor375678.i
+  ret i32 %xor2.i
+}
+
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+

>From 60189b351381c80938699b9958e6e07f69e65ce1 Mon Sep 17 00:00:00 2001
From: Sergei Druzhkov <serzhdruzhok at gmail.com>
Date: Fri, 26 Dec 2025 22:43:38 +0300
Subject: [PATCH 07/34] [lldb-dap] Migrate stackTrace request to structured
 types (#173226)

This patch finishes migration to structured types and removes
`LegacyRequestHandler`.
---
 lldb/tools/lldb-dap/Handler/RequestHandler.h  |  19 +-
 .../Handler/StackTraceRequestHandler.cpp      | 255 +++++++++---------
 lldb/tools/lldb-dap/JSONUtils.cpp             | 158 -----------
 lldb/tools/lldb-dap/JSONUtils.h               |  50 ----
 lldb/tools/lldb-dap/LLDBUtils.cpp             |   5 +-
 lldb/tools/lldb-dap/LLDBUtils.h               |   2 +-
 .../lldb-dap/Protocol/ProtocolRequests.cpp    |  18 ++
 .../lldb-dap/Protocol/ProtocolRequests.h      |  38 +++
 .../tools/lldb-dap/Protocol/ProtocolTypes.cpp |  56 ++++
 lldb/tools/lldb-dap/Protocol/ProtocolTypes.h  |  89 ++++++
 lldb/unittests/DAP/ProtocolRequestsTest.cpp   |  68 +++++
 lldb/unittests/DAP/ProtocolTypesTest.cpp      |  86 ++++++
 12 files changed, 491 insertions(+), 353 deletions(-)

diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h
index ff0ff4decf0f0..1e3d66ed3f795 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.h
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h
@@ -93,16 +93,6 @@ class BaseRequestHandler {
   DAP &dap;
 };
 
-/// FIXME: Migrate callers to typed RequestHandler for improved type handling.
-class LegacyRequestHandler : public BaseRequestHandler {
-  using BaseRequestHandler::BaseRequestHandler;
-  virtual void operator()(const llvm::json::Object &request) const = 0;
-  void operator()(const protocol::Request &request) const override {
-    auto req = toJSON(request);
-    (*this)(*req.getAsObject());
-  }
-};
-
 template <typename Args>
 llvm::Expected<Args> parseArgs(const protocol::Request &request) {
   if (!is_optional_v<Args> && !request.arguments)
@@ -543,11 +533,14 @@ class SourceRequestHandler final
   Run(const protocol::SourceArguments &args) const override;
 };
 
-class StackTraceRequestHandler : public LegacyRequestHandler {
+class StackTraceRequestHandler
+    : public RequestHandler<protocol::StackTraceArguments,
+                            llvm::Expected<protocol::StackTraceResponseBody>> {
 public:
-  using LegacyRequestHandler::LegacyRequestHandler;
+  using RequestHandler::RequestHandler;
   static llvm::StringLiteral GetCommand() { return "stackTrace"; }
-  void operator()(const llvm::json::Object &request) const override;
+  llvm::Expected<protocol::StackTraceResponseBody>
+  Run(const protocol::StackTraceArguments &args) const override;
   FeatureSet GetSupportedFeatures() const override {
     return {protocol::eAdapterFeatureDelayedStackTraceLoading};
   }
diff --git a/lldb/tools/lldb-dap/Handler/StackTraceRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/StackTraceRequestHandler.cpp
index 77ef952a1e343..7064d356a6479 100644
--- a/lldb/tools/lldb-dap/Handler/StackTraceRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/StackTraceRequestHandler.cpp
@@ -7,14 +7,108 @@
 //===----------------------------------------------------------------------===//
 
 #include "DAP.h"
+#include "DAPError.h"
 #include "EventHelper.h"
-#include "JSONUtils.h"
+#include "LLDBUtils.h"
+#include "Protocol/ProtocolRequests.h"
+#include "ProtocolUtils.h"
 #include "RequestHandler.h"
+#include "lldb/API/SBStream.h"
 
-namespace lldb_dap {
+using namespace lldb_dap;
+using namespace lldb_dap::protocol;
 
-/// Page size used for reporting addtional frames in the 'stackTrace' request.
-static constexpr int StackPageSize = 20;
+/// Page size used for reporting additional frames in the 'stackTrace' request.
+static constexpr int k_stack_page_size = 20;
+
+// Create a "StackFrame" object for a LLDB frame object.
+static StackFrame CreateStackFrame(DAP &dap, lldb::SBFrame &frame,
+                                   lldb::SBFormat &format) {
+  StackFrame stack_frame;
+  stack_frame.id = MakeDAPFrameID(frame);
+
+  lldb::SBStream stream;
+  if (format && frame.GetDescriptionWithFormat(format, stream).Success()) {
+    stack_frame.name = llvm::StringRef(stream.GetData(), stream.GetSize());
+
+    // `function_name` can be a nullptr, which throws an error when assigned to
+    // an `std::string`.
+  } else if (llvm::StringRef name = frame.GetDisplayFunctionName();
+             !name.empty()) {
+    stack_frame.name = name;
+  }
+
+  if (stack_frame.name.empty()) {
+    // If the function name is unavailable, display the pc address as a 16-digit
+    // hex string, e.g. "0x0000000000012345"
+    stack_frame.name = GetLoadAddressString(frame.GetPC());
+  }
+
+  // We only include `[opt]` if a custom frame format is not specified.
+  if (!format && frame.GetFunction().GetIsOptimized())
+    stack_frame.name += " [opt]";
+
+  std::optional<protocol::Source> source = dap.ResolveSource(frame);
+  if (source && !IsAssemblySource(*source)) {
+    // This is a normal source with a valid line entry.
+    auto line_entry = frame.GetLineEntry();
+    stack_frame.line = line_entry.GetLine();
+    stack_frame.column = line_entry.GetColumn();
+  } else if (frame.GetSymbol().IsValid()) {
+    // This is a source where the disassembly is used, but there is a valid
+    // symbol. Calculate the line of the current PC from the start of the
+    // current symbol.
+    lldb::SBInstructionList inst_list = dap.target.ReadInstructions(
+        frame.GetSymbol().GetStartAddress(), frame.GetPCAddress(), nullptr);
+    size_t inst_line = inst_list.GetSize();
+
+    // Line numbers are 1-based.
+    stack_frame.line = inst_line + 1;
+    stack_frame.column = 1;
+  } else {
+    // No valid line entry or symbol.
+    stack_frame.line = 0;
+    stack_frame.column = 0;
+  }
+
+  stack_frame.source = std::move(source);
+  stack_frame.instructionPointerReference = frame.GetPC();
+
+  if (frame.IsArtificial() || frame.IsHidden())
+    stack_frame.presentationHint = StackFrame::ePresentationHintSubtle;
+  if (const lldb::SBModule module = frame.GetModule()) {
+    if (llvm::StringRef uuid = module.GetUUIDString(); !uuid.empty())
+      stack_frame.moduleId = uuid.str();
+  }
+
+  return stack_frame;
+}
+
+// Create a "StackFrame" label object for a LLDB thread.
+static StackFrame CreateExtendedStackFrameLabel(lldb::SBThread &thread,
+                                                lldb::SBFormat &format) {
+  StackFrame stack_frame;
+  lldb::SBStream stream;
+  if (format && thread.GetDescriptionWithFormat(format, stream).Success()) {
+    stack_frame.name = llvm::StringRef(stream.GetData(), stream.GetSize());
+  } else {
+    const uint32_t thread_idx = thread.GetExtendedBacktraceOriginatingIndexID();
+    if (llvm::StringRef queue_name = thread.GetQueueName();
+        !queue_name.empty()) {
+      stack_frame.name = llvm::formatv("Enqueued from {0} (Thread {1})",
+                                       queue_name, thread_idx);
+    } else {
+      stack_frame.name = llvm::formatv("Thread {0}", thread_idx);
+    }
+  }
+
+  stack_frame.id = thread.GetThreadID() + 1;
+  stack_frame.presentationHint = StackFrame::ePresentationHintLabel;
+  stack_frame.line = 0;
+  stack_frame.column = 0;
+
+  return stack_frame;
+}
 
 // Fill in the stack frames of the thread.
 //
@@ -50,9 +144,9 @@ static constexpr int StackPageSize = 20;
 // s=3,l=3 = [th0->s3, label1, th1->s0]
 static bool FillStackFrames(DAP &dap, lldb::SBThread &thread,
                             lldb::SBFormat &frame_format,
-                            llvm::json::Array &stack_frames, int64_t &offset,
-                            const int64_t start_frame, const int64_t levels,
-                            const bool include_all) {
+                            std::vector<StackFrame> &stack_frames,
+                            int64_t &offset, const int64_t start_frame,
+                            const int64_t levels, const bool include_all) {
   bool reached_end_of_stack = false;
   for (int64_t i = start_frame;
        static_cast<int64_t>(stack_frames.size()) < levels; i++) {
@@ -93,150 +187,53 @@ static bool FillStackFrames(DAP &dap, lldb::SBThread &thread,
   return reached_end_of_stack;
 }
 
-// "StackTraceRequest": {
-//   "allOf": [ { "$ref": "#/definitions/Request" }, {
-//     "type": "object",
-//     "description": "StackTrace request; value of command field is
-//     'stackTrace'. The request returns a stacktrace from the current execution
-//     state.", "properties": {
-//       "command": {
-//         "type": "string",
-//         "enum": [ "stackTrace" ]
-//       },
-//       "arguments": {
-//         "$ref": "#/definitions/StackTraceArguments"
-//       }
-//     },
-//     "required": [ "command", "arguments"  ]
-//   }]
-// },
-// "StackTraceArguments": {
-//   "type": "object",
-//   "description": "Arguments for 'stackTrace' request.",
-//   "properties": {
-//     "threadId": {
-//       "type": "integer",
-//       "description": "Retrieve the stacktrace for this thread."
-//     },
-//     "startFrame": {
-//       "type": "integer",
-//       "description": "The index of the first frame to return; if omitted
-//       frames start at 0."
-//     },
-//     "levels": {
-//       "type": "integer",
-//       "description": "The maximum number of frames to return. If levels is
-//       not specified or 0, all frames are returned."
-//     },
-//     "format": {
-//       "$ref": "#/definitions/StackFrameFormat",
-//       "description": "Specifies details on how to format the stack frames.
-//       The attribute is only honored by a debug adapter if the corresponding
-//       capability `supportsValueFormattingOptions` is true."
-//     }
-//  },
-//   "required": [ "threadId" ]
-// },
-// "StackTraceResponse": {
-//   "allOf": [ { "$ref": "#/definitions/Response" }, {
-//     "type": "object",
-//     "description": "Response to `stackTrace` request.",
-//     "properties": {
-//       "body": {
-//         "type": "object",
-//         "properties": {
-//           "stackFrames": {
-//             "type": "array",
-//             "items": {
-//               "$ref": "#/definitions/StackFrame"
-//             },
-//             "description": "The frames of the stackframe. If the array has
-//             length zero, there are no stackframes available. This means that
-//             there is no location information available."
-//           },
-//           "totalFrames": {
-//             "type": "integer",
-//             "description": "The total number of frames available in the
-//             stack. If omitted or if `totalFrames` is larger than the
-//             available frames, a client is expected to request frames until
-//             a request returns less frames than requested (which indicates
-//             the end of the stack). Returning monotonically increasing
-//             `totalFrames` values for subsequent requests can be used to
-//             enforce paging in the client."
-//           }
-//         },
-//         "required": [ "stackFrames" ]
-//       }
-//     },
-//     "required": [ "body" ]
-//   }]
-// }
-void StackTraceRequestHandler::operator()(
-    const llvm::json::Object &request) const {
-  llvm::json::Object response;
-  FillResponse(request, response);
-  lldb::SBError error;
-  const auto *arguments = request.getObject("arguments");
-  lldb::SBThread thread = dap.GetLLDBThread(*arguments);
-  llvm::json::Array stack_frames;
-  llvm::json::Object body;
+llvm::Expected<protocol::StackTraceResponseBody>
+StackTraceRequestHandler::Run(const protocol::StackTraceArguments &args) const {
+  lldb::SBThread thread = dap.GetLLDBThread(args.threadId);
+  if (!thread.IsValid())
+    return llvm::make_error<DAPError>("invalid thread");
 
   lldb::SBFormat frame_format = dap.frame_format;
   bool include_all = dap.configuration.displayExtendedBacktrace;
 
-  if (const auto *format = arguments->getObject("format")) {
-    // Indicates that all stack frames should be included, even those the debug
-    // adapter might otherwise hide.
-    include_all = GetBoolean(format, "includeAll").value_or(false);
+  if (args.format) {
+    const StackFrameFormat &format = *args.format;
 
-    // Parse the properties that have a corresponding format string.
-    // FIXME: Support "parameterTypes" and "hex".
-    const bool module = GetBoolean(format, "module").value_or(false);
-    const bool line = GetBoolean(format, "line").value_or(false);
-    const bool parameters = GetBoolean(format, "parameters").value_or(false);
-    const bool parameter_names =
-        GetBoolean(format, "parameterNames").value_or(false);
-    const bool parameter_values =
-        GetBoolean(format, "parameterValues").value_or(false);
+    include_all = format.includeAll;
 
+    // FIXME: Support "parameterTypes" and "hex".
     // Only change the format string if we have to.
-    if (module || line || parameters || parameter_names || parameter_values) {
+    if (format.module || format.line || format.parameters ||
+        format.parameterNames || format.parameterValues) {
       std::string format_str;
       llvm::raw_string_ostream os(format_str);
 
-      if (module)
+      if (format.module)
         os << "{${module.file.basename} }";
 
-      if (line)
+      if (format.line)
         os << "{${line.file.basename}:${line.number}:${line.column} }";
 
-      if (parameters || parameter_names || parameter_values)
+      if (format.parameters || format.parameterNames || format.parameterValues)
         os << "{${function.name-with-args}}";
       else
         os << "{${function.name-without-args}}";
 
       lldb::SBError error;
       frame_format = lldb::SBFormat(format_str.c_str(), error);
-      assert(error.Success());
+      if (error.Fail())
+        return ToError(error);
     }
   }
 
-  if (thread.IsValid()) {
-    const auto start_frame =
-        GetInteger<uint64_t>(arguments, "startFrame").value_or(0);
-    const auto levels = GetInteger<uint64_t>(arguments, "levels").value_or(0);
-    int64_t offset = 0;
-    bool reached_end_of_stack = FillStackFrames(
-        dap, thread, frame_format, stack_frames, offset, start_frame,
-        levels == 0 ? INT64_MAX : levels, include_all);
-    body.try_emplace("totalFrames",
-                     start_frame + stack_frames.size() +
-                         (reached_end_of_stack ? 0 : StackPageSize));
-  }
+  StackTraceResponseBody body;
+  const auto levels = args.levels == 0 ? INT64_MAX : args.levels;
+  int64_t offset = 0;
+  bool reached_end_of_stack =
+      FillStackFrames(dap, thread, frame_format, body.stackFrames, offset,
+                      args.startFrame, levels, include_all);
+  body.totalFrames = args.startFrame + body.stackFrames.size() +
+                     (reached_end_of_stack ? 0 : k_stack_page_size);
 
-  body.try_emplace("stackFrames", std::move(stack_frames));
-  response.try_emplace("body", std::move(body));
-  dap.SendJSON(llvm::json::Value(std::move(response)));
+  return body;
 }
-
-} // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index 86d1b18b933bc..9a2142cd847ab 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -430,164 +430,6 @@ llvm::json::Object CreateEventObject(const llvm::StringRef event_name) {
   return event;
 }
 
-// "StackFrame": {
-//   "type": "object",
-//   "description": "A Stackframe contains the source location.",
-//   "properties": {
-//     "id": {
-//       "type": "integer",
-//       "description": "An identifier for the stack frame. It must be unique
-//                       across all threads. This id can be used to retrieve
-//                       the scopes of the frame with the 'scopesRequest' or
-//                       to restart the execution of a stackframe."
-//     },
-//     "name": {
-//       "type": "string",
-//       "description": "The name of the stack frame, typically a method name."
-//     },
-//     "source": {
-//       "$ref": "#/definitions/Source",
-//       "description": "The optional source of the frame."
-//     },
-//     "line": {
-//       "type": "integer",
-//       "description": "The line within the file of the frame. If source is
-//                       null or doesn't exist, line is 0 and must be ignored."
-//     },
-//     "column": {
-//       "type": "integer",
-//       "description": "The column within the line. If source is null or
-//                       doesn't exist, column is 0 and must be ignored."
-//     },
-//     "endLine": {
-//       "type": "integer",
-//       "description": "An optional end line of the range covered by the
-//                       stack frame."
-//     },
-//     "endColumn": {
-//       "type": "integer",
-//       "description": "An optional end column of the range covered by the
-//                       stack frame."
-//     },
-//     "instructionPointerReference": {
-// 	     "type": "string",
-// 	     "description": "A memory reference for the current instruction
-//                         pointer in this frame."
-//     },
-//     "moduleId": {
-//       "type": ["integer", "string"],
-//       "description": "The module associated with this frame, if any."
-//     },
-//     "presentationHint": {
-//       "type": "string",
-//       "enum": [ "normal", "label", "subtle" ],
-//       "description": "An optional hint for how to present this frame in
-//                       the UI. A value of 'label' can be used to indicate
-//                       that the frame is an artificial frame that is used
-//                       as a visual label or separator. A value of 'subtle'
-//                       can be used to change the appearance of a frame in
-//                       a 'subtle' way."
-//     }
-//   },
-//   "required": [ "id", "name", "line", "column" ]
-// }
-llvm::json::Value CreateStackFrame(DAP &dap, lldb::SBFrame &frame,
-                                   lldb::SBFormat &format) {
-  llvm::json::Object object;
-  int64_t frame_id = MakeDAPFrameID(frame);
-  object.try_emplace("id", frame_id);
-
-  std::string frame_name;
-  lldb::SBStream stream;
-  if (format && frame.GetDescriptionWithFormat(format, stream).Success()) {
-    frame_name = stream.GetData();
-
-    // `function_name` can be a nullptr, which throws an error when assigned to
-    // an `std::string`.
-  } else if (const char *name = frame.GetDisplayFunctionName()) {
-    frame_name = name;
-  }
-
-  if (frame_name.empty()) {
-    // If the function name is unavailable, display the pc address as a 16-digit
-    // hex string, e.g. "0x0000000000012345"
-    frame_name = GetLoadAddressString(frame.GetPC());
-  }
-
-  // We only include `[opt]` if a custom frame format is not specified.
-  if (!format && frame.GetFunction().GetIsOptimized())
-    frame_name += " [opt]";
-
-  EmplaceSafeString(object, "name", frame_name);
-
-  std::optional<protocol::Source> source = dap.ResolveSource(frame);
-
-  if (source && !IsAssemblySource(*source)) {
-    // This is a normal source with a valid line entry.
-    auto line_entry = frame.GetLineEntry();
-    object.try_emplace("line", line_entry.GetLine());
-    auto column = line_entry.GetColumn();
-    object.try_emplace("column", column);
-  } else if (frame.GetSymbol().IsValid()) {
-    // This is a source where the disassembly is used, but there is a valid
-    // symbol. Calculate the line of the current PC from the start of the
-    // current symbol.
-    lldb::SBInstructionList inst_list = dap.target.ReadInstructions(
-        frame.GetSymbol().GetStartAddress(), frame.GetPCAddress(), nullptr);
-    size_t inst_line = inst_list.GetSize();
-
-    // Line numbers are 1-based.
-    object.try_emplace("line", inst_line + 1);
-    object.try_emplace("column", 1);
-  } else {
-    // No valid line entry or symbol.
-    object.try_emplace("line", 1);
-    object.try_emplace("column", 1);
-  }
-
-  if (source)
-    object.try_emplace("source", std::move(source).value());
-
-  const auto pc = frame.GetPC();
-  if (pc != LLDB_INVALID_ADDRESS) {
-    std::string formatted_addr = "0x" + llvm::utohexstr(pc);
-    object.try_emplace("instructionPointerReference", formatted_addr);
-  }
-
-  if (frame.IsArtificial() || frame.IsHidden())
-    object.try_emplace("presentationHint", "subtle");
-
-  lldb::SBModule module = frame.GetModule();
-  if (module.IsValid()) {
-    if (const llvm::StringRef uuid = module.GetUUIDString(); !uuid.empty())
-      object.try_emplace("moduleId", uuid.str());
-  }
-
-  return llvm::json::Value(std::move(object));
-}
-
-llvm::json::Value CreateExtendedStackFrameLabel(lldb::SBThread &thread,
-                                                lldb::SBFormat &format) {
-  std::string name;
-  lldb::SBStream stream;
-  if (format && thread.GetDescriptionWithFormat(format, stream).Success()) {
-    name = stream.GetData();
-  } else {
-    const uint32_t thread_idx = thread.GetExtendedBacktraceOriginatingIndexID();
-    const char *queue_name = thread.GetQueueName();
-    if (queue_name != nullptr) {
-      name = llvm::formatv("Enqueued from {0} (Thread {1})", queue_name,
-                           thread_idx);
-    } else {
-      name = llvm::formatv("Thread {0}", thread_idx);
-    }
-  }
-
-  return llvm::json::Value(llvm::json::Object{{"id", thread.GetThreadID() + 1},
-                                              {"name", name},
-                                              {"presentationHint", "label"}});
-}
-
 // "StoppedEvent": {
 //   "allOf": [ { "$ref": "#/definitions/Event" }, {
 //     "type": "object",
diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h
index 1e38de6f5b80d..15449d6ece62a 100644
--- a/lldb/tools/lldb-dap/JSONUtils.h
+++ b/lldb/tools/lldb-dap/JSONUtils.h
@@ -234,56 +234,6 @@ void FillResponse(const llvm::json::Object &request,
 ///     definition outlined by Microsoft.
 llvm::json::Object CreateEventObject(const llvm::StringRef event_name);
 
-/// Create a "StackFrame" object for a LLDB frame object.
-///
-/// This function will fill in the following keys in the returned
-/// object:
-///   "id" - the stack frame ID as an integer
-///   "name" - the function name as a string
-///   "source" - source file information as a "Source" DAP object
-///   "line" - the source file line number as an integer
-///   "column" - the source file column number as an integer
-///
-/// \param[in] dap
-///     The DAP session associated with the stopped thread.
-///
-/// \param[in] frame
-///     The LLDB stack frame to use when populating out the "StackFrame"
-///     object.
-///
-/// \param[in] format
-///     The LLDB format to use when populating out the "StackFrame"
-///     object.
-///
-/// \return
-///     A "StackFrame" JSON object with that follows the formal JSON
-///     definition outlined by Microsoft.
-llvm::json::Value CreateStackFrame(DAP &dap, lldb::SBFrame &frame,
-                                   lldb::SBFormat &format);
-
-/// Create a "StackFrame" label object for a LLDB thread.
-///
-/// This function will fill in the following keys in the returned
-/// object:
-///   "id" - the thread ID as an integer
-///   "name" - the thread name as a string which combines the LLDB
-///            thread index ID along with the string name of the thread
-///            from the OS if it has a name.
-///   "presentationHint" - "label"
-///
-/// \param[in] thread
-///     The LLDB thread to use when populating out the "Thread"
-///     object.
-///
-/// \param[in] format
-///     The configured formatter for the DAP session.
-///
-/// \return
-///     A "StackFrame" JSON object with that follows the formal JSON
-///     definition outlined by Microsoft.
-llvm::json::Value CreateExtendedStackFrameLabel(lldb::SBThread &thread,
-                                                lldb::SBFormat &format);
-
 /// Create a "StoppedEvent" object for a LLDB thread object.
 ///
 /// This function will fill in the following keys in the returned
diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp
index e2ba2ee64103d..22e4012b238ac 100644
--- a/lldb/tools/lldb-dap/LLDBUtils.cpp
+++ b/lldb/tools/lldb-dap/LLDBUtils.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include <cstdint>
 #include <cstring>
 #include <mutex>
 #include <system_error>
@@ -159,8 +160,8 @@ uint32_t GetLLDBFrameID(uint64_t dap_frame_id) {
   return dap_frame_id & ((1u << THREAD_INDEX_SHIFT) - 1);
 }
 
-int64_t MakeDAPFrameID(lldb::SBFrame &frame) {
-  return ((int64_t)frame.GetThread().GetIndexID() << THREAD_INDEX_SHIFT) |
+uint64_t MakeDAPFrameID(lldb::SBFrame &frame) {
+  return ((uint64_t)frame.GetThread().GetIndexID() << THREAD_INDEX_SHIFT) |
          frame.GetFrameID();
 }
 
diff --git a/lldb/tools/lldb-dap/LLDBUtils.h b/lldb/tools/lldb-dap/LLDBUtils.h
index a29d3d88789a0..9545654504e8d 100644
--- a/lldb/tools/lldb-dap/LLDBUtils.h
+++ b/lldb/tools/lldb-dap/LLDBUtils.h
@@ -125,7 +125,7 @@ bool ThreadHasStopReason(lldb::SBThread &thread);
 /// \return
 ///     A unique integer that allows us to easily find the right
 ///     stack frame within a thread on subsequent VS code requests.
-int64_t MakeDAPFrameID(lldb::SBFrame &frame);
+uint64_t MakeDAPFrameID(lldb::SBFrame &frame);
 
 /// Given a DAP frame ID, convert to a LLDB thread index id.
 ///
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
index c87111d8f1b78..c3225f6ba0e35 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
@@ -802,4 +802,22 @@ bool fromJSON(const llvm::json::Value &Params, RestartArguments &Args,
   return false;
 }
 
+bool fromJSON(const llvm::json::Value &Params, StackTraceArguments &Args,
+              llvm::json::Path Path) {
+  json::ObjectMapper O(Params, Path);
+  return O && O.map("threadId", Args.threadId) &&
+         O.mapOptional("startFrame", Args.startFrame) &&
+         O.mapOptional("levels", Args.levels) &&
+         O.mapOptional("format", Args.format);
+}
+
+llvm::json::Value toJSON(const StackTraceResponseBody &Body) {
+  json::Object result{{"stackFrames", Body.stackFrames}};
+
+  if (Body.totalFrames)
+    result.insert({"totalFrames", Body.totalFrames});
+
+  return result;
+}
+
 } // namespace lldb_dap::protocol
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index 33fcaae1710b5..9a99af9068ef1 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -1268,6 +1268,44 @@ bool fromJSON(const llvm::json::Value &, RestartArguments &, llvm::json::Path);
 /// field is required.
 using RestartResponse = VoidResponse;
 
+/// Arguments for `stackTrace` request.
+struct StackTraceArguments {
+  /// Retrieve the stacktrace for this thread.
+  lldb::tid_t threadId = LLDB_INVALID_THREAD_ID;
+
+  /// The index of the first frame to return; if omitted frames start at 0.
+  uint32_t startFrame = 0;
+
+  /// The maximum number of frames to return. If levels is not specified or 0,
+  /// all frames are returned.
+  uint32_t levels = 0;
+
+  /// Specifies details on how to format the returned `StackFrame.name`. The
+  /// debug adapter may format requested details in any way that would make
+  /// sense to a developer. The attribute is only honored by a debug adapter if
+  /// the corresponding capability `supportsValueFormattingOptions` is true.
+  std::optional<StackFrameFormat> format;
+};
+bool fromJSON(const llvm::json::Value &, StackTraceArguments &,
+              llvm::json::Path);
+
+/// Response to `stackTrace` request.
+struct StackTraceResponseBody {
+  /// The frames of the stack frame. If the array has length zero, there are no
+  /// stack frames available.
+  /// This means that there is no location information available.
+  std::vector<StackFrame> stackFrames;
+
+  /// The total number of frames available in the stack. If omitted or if
+  /// `totalFrames` is larger than the available frames, a client is expected to
+  /// request frames until a request returns less frames than requested (which
+  /// indicates the end of the stack). Returning monotonically increasing
+  /// `totalFrames` values for subsequent requests can be used to enforce paging
+  /// in the client.
+  uint32_t totalFrames = 0;
+};
+llvm::json::Value toJSON(const StackTraceResponseBody &);
+
 } // namespace lldb_dap::protocol
 
 #endif
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
index c7f7c447b5b6f..17e2b4b73f532 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/JSON.h"
+#include <cassert>
 #include <optional>
 
 using namespace llvm;
@@ -1174,4 +1175,59 @@ llvm::json::Value toJSON(const CompileUnit &CU) {
   return result;
 }
 
+bool fromJSON(const llvm::json::Value &Params, StackFrameFormat &SFF,
+              llvm::json::Path Path) {
+  json::ObjectMapper O(Params, Path);
+  return O && O.mapOptional("parameters", SFF.parameters) &&
+         O.mapOptional("parameterTypes", SFF.parameterTypes) &&
+         O.mapOptional("parameterNames", SFF.parameterNames) &&
+         O.mapOptional("parameterValues", SFF.parameterValues) &&
+         O.mapOptional("line", SFF.line) &&
+         O.mapOptional("module", SFF.module) &&
+         O.mapOptional("includeAll", SFF.includeAll);
+}
+
+llvm::json::Value toJSON(const StackFrame::PresentationHint &PH) {
+  switch (PH) {
+  case StackFrame::ePresentationHintNone:
+    return "";
+  case StackFrame::ePresentationHintNormal:
+    return "normal";
+  case StackFrame::ePresentationHintLabel:
+    return "label";
+  case StackFrame::ePresentationHintSubtle:
+    return "subtle";
+  }
+  llvm_unreachable("unhandled stackFrame presentationHint.");
+}
+
+llvm::json::Value toJSON(const StackFrame &SF) {
+  json::Object result{{"id", SF.id}, {"name", SF.name}};
+
+  if (SF.source) {
+    result.insert({"source", *SF.source});
+    assert(SF.line != LLDB_INVALID_LINE_NUMBER);
+    result.insert({"line", SF.line});
+    result.insert({"column", SF.column});
+    if (SF.endLine != 0 && SF.endLine != LLDB_INVALID_LINE_NUMBER)
+      result.insert({"endLine", SF.endLine});
+    if (SF.endColumn != 0 && SF.endColumn != LLDB_INVALID_COLUMN_NUMBER)
+      result.insert({"endColumn", SF.endColumn});
+  } else {
+    result.insert({"line", 0});
+    result.insert({"column", 0});
+  }
+  if (SF.canRestart)
+    result.insert({"canRestart", SF.canRestart});
+  if (SF.instructionPointerReference != LLDB_INVALID_ADDRESS)
+    result.insert({"instructionPointerReference",
+                   EncodeMemoryReference(SF.instructionPointerReference)});
+  if (SF.moduleId)
+    result.insert({"moduleId", *SF.moduleId});
+  if (SF.presentationHint != StackFrame::ePresentationHintNone)
+    result.insert({"presentationHint", SF.presentationHint});
+
+  return result;
+}
+
 } // namespace lldb_dap::protocol
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
index 4ead4786bc661..3433dc74d5b31 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
@@ -22,6 +22,7 @@
 
 #include "Protocol/DAPTypes.h"
 #include "lldb/lldb-defines.h"
+#include "lldb/lldb-types.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/JSON.h"
 #include <cstdint>
@@ -31,6 +32,7 @@
 #define LLDB_DAP_INVALID_VAR_REF INT64_MAX
 #define LLDB_DAP_INVALID_SRC_REF 0
 #define LLDB_DAP_INVALID_VALUE_LOC 0
+#define LLDB_DAP_INVALID_STACK_FRAME_ID UINT64_MAX
 
 namespace lldb_dap::protocol {
 
@@ -1044,6 +1046,93 @@ struct CompileUnit {
 };
 llvm::json::Value toJSON(const CompileUnit &);
 
+/// Provides formatting information for a stack frame.
+struct StackFrameFormat {
+  /// Displays parameters for the stack frame.
+  bool parameters = false;
+
+  /// Displays the types of parameters for the stack frame.
+  bool parameterTypes = false;
+
+  /// Displays the names of parameters for the stack frame.
+  bool parameterNames = false;
+
+  /// Displays the values of parameters for the stack frame.
+  bool parameterValues = false;
+
+  /// Displays the line number of the stack frame.
+  bool line = false;
+
+  /// Displays the module of the stack frame.
+  bool module = false;
+
+  /// Includes all stack frames, including those the debug adapter might
+  /// otherwise hide.
+  bool includeAll = false;
+};
+bool fromJSON(const llvm::json::Value &, StackFrameFormat &, llvm::json::Path);
+
+/// A Stackframe contains the source location.
+struct StackFrame {
+  enum PresentationHint : unsigned {
+    ePresentationHintNone,
+    ePresentationHintNormal,
+    ePresentationHintLabel,
+    ePresentationHintSubtle,
+  };
+
+  /// An identifier for the stack frame. It must be unique across all threads.
+  /// This id can be used to retrieve the scopes of the frame with the `scopes`
+  /// request or to restart the execution of a stack frame.
+  lldb::tid_t id = LLDB_DAP_INVALID_STACK_FRAME_ID;
+
+  /// The name of the stack frame, typically a method name.
+  std::string name;
+
+  /// The source of the frame.
+  std::optional<Source> source;
+
+  /// The line within the source of the frame. If the source attribute is
+  /// missing or doesn't exist, `line` is 0 and should be ignored by the client.
+  uint32_t line = LLDB_INVALID_LINE_NUMBER;
+
+  /// Start position of the range covered by the stack frame. It is measured in
+  /// UTF-16 code units and the client capability `columnsStartAt1` determines
+  /// whether it is 0- or 1-based. If attribute `source` is missing or doesn't
+  /// exist, `column` is 0 and should be ignored by the client.
+  uint32_t column = LLDB_INVALID_COLUMN_NUMBER;
+
+  /// The end line of the range covered by the stack frame.
+  uint32_t endLine = LLDB_INVALID_LINE_NUMBER;
+
+  /// End position of the range covered by the stack frame. It is measured in
+  /// UTF-16 code units and the client capability `columnsStartAt1` determines
+  /// whether it is 0- or 1-based.
+  uint32_t endColumn = LLDB_INVALID_COLUMN_NUMBER;
+
+  /// Indicates whether this frame can be restarted with the `restartFrame`
+  /// request. Clients should only use this if the debug adapter supports the
+  /// `restart` request and the corresponding capability `supportsRestartFrame`
+  /// is true. If a debug adapter has this capability, then `canRestart`
+  /// defaults to `true` if the property is absent.
+  bool canRestart = false;
+
+  /// A memory reference for the current instruction pointer in this frame.
+  lldb::addr_t instructionPointerReference = LLDB_INVALID_ADDRESS;
+
+  /// The module associated with this frame, if any.
+  std::optional<std::string> moduleId;
+
+  /// A hint for how to present this frame in the UI. A value of `label` can be
+  /// used to indicate that the frame is an artificial frame that is used as a
+  /// visual label or separator. A value of `subtle` can be used to change the
+  /// appearance of a frame in a 'subtle' way. Values: 'normal', 'label',
+  /// 'subtle'
+  PresentationHint presentationHint = ePresentationHintNone;
+};
+llvm::json::Value toJSON(const StackFrame::PresentationHint &);
+llvm::json::Value toJSON(const StackFrame &);
+
 } // namespace lldb_dap::protocol
 
 #endif
diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp
index c639e40453fb0..cdc012b448c8f 100644
--- a/lldb/unittests/DAP/ProtocolRequestsTest.cpp
+++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp
@@ -344,3 +344,71 @@ TEST(ProtocolRequestsTest, RestartArguments) {
   EXPECT_NE(attach_args, nullptr);
   EXPECT_EQ(attach_args->pid, 123U);
 }
+
+TEST(ProtocolRequestsTest, StackTraceArguments) {
+  llvm::Expected<StackTraceArguments> expected = parse<StackTraceArguments>(R"({
+    "threadId": 42,
+    "startFrame": 1,
+    "levels": 10,
+    "format": {
+      "parameters": true,
+      "line": true
+    }
+  })");
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+  EXPECT_EQ(expected->threadId, 42U);
+  EXPECT_EQ(expected->startFrame, 1U);
+  EXPECT_EQ(expected->levels, 10U);
+  EXPECT_EQ(expected->format->parameters, true);
+  EXPECT_EQ(expected->format->line, true);
+
+  // Check required keys.
+  EXPECT_THAT_EXPECTED(parse<StackTraceArguments>(R"({})"),
+                       FailedWithMessage("missing value at (root).threadId"));
+}
+
+TEST(ProtocolRequestsTest, StackTraceResponseBody) {
+  StackFrame frame1;
+  frame1.id = 1;
+  frame1.name = "main";
+  frame1.source = Source{};
+  frame1.source->name = "main.cpp";
+  frame1.source->sourceReference = 123;
+  frame1.line = 23;
+  frame1.column = 1;
+  StackFrame frame2;
+  frame2.id = 2;
+  frame2.name = "test";
+  frame2.presentationHint = StackFrame::ePresentationHintLabel;
+
+  StackTraceResponseBody body;
+  body.stackFrames = {frame1, frame2};
+  body.totalFrames = 2;
+
+  // Check required keys.
+  Expected<json::Value> expected = parse(R"({
+    "stackFrames": [
+      {
+        "id": 1,
+        "name": "main",
+        "source": {
+          "name": "main.cpp",
+          "sourceReference": 123
+        },
+        "line": 23,
+        "column": 1
+      },
+      {
+        "id": 2,
+        "name": "test",
+        "line": 0,
+        "column": 0,
+        "presentationHint": "label"
+      }
+    ],
+    "totalFrames": 2
+  })");
+
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+  EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body));
+}
diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp
index 6a4620a3f1e59..f842b6c2a20e3 100644
--- a/lldb/unittests/DAP/ProtocolTypesTest.cpp
+++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp
@@ -1176,3 +1176,89 @@ TEST(ProtocolTypesTest, ExceptionDetails) {
   ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded());
   EXPECT_EQ(pp(*expected_opt), pp(details));
 }
+
+TEST(ProtocolTypesTest, StackFramePresentationHint) {
+  // Test all PresentationHint values.
+  std::vector<std::pair<StackFrame::PresentationHint, llvm::StringRef>>
+      test_cases = {{StackFrame::ePresentationHintNormal, "normal"},
+                    {StackFrame::ePresentationHintLabel, "label"},
+                    {StackFrame::ePresentationHintSubtle, "subtle"}};
+
+  for (const auto &test_case : test_cases) {
+    // Serialize the PresentationHint to JSON.
+    llvm::json::Value serialized = toJSON(test_case.first);
+    ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String);
+    EXPECT_EQ(serialized.getAsString(), test_case.second);
+  }
+}
+
+TEST(ProtocolTypesTest, StackFrameFormat) {
+  llvm::Expected<StackFrameFormat> expected = parse<StackFrameFormat>(R"({})");
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+  EXPECT_EQ(expected->parameters, false);
+  EXPECT_EQ(expected->parameterTypes, false);
+  EXPECT_EQ(expected->parameterNames, false);
+  EXPECT_EQ(expected->parameterValues, false);
+  EXPECT_EQ(expected->line, false);
+  EXPECT_EQ(expected->module, false);
+  EXPECT_EQ(expected->includeAll, false);
+
+  expected = parse<StackFrameFormat>(R"({
+    "line": true,
+    "parameterNames": true
+  })");
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+  EXPECT_EQ(expected->parameters, false);
+  EXPECT_EQ(expected->parameterTypes, false);
+  EXPECT_EQ(expected->parameterNames, true);
+  EXPECT_EQ(expected->parameterValues, false);
+  EXPECT_EQ(expected->line, true);
+  EXPECT_EQ(expected->module, false);
+  EXPECT_EQ(expected->includeAll, false);
+}
+
+TEST(ProtocolTypesTest, StackFrame) {
+  StackFrame frame;
+  frame.id = 1;
+  frame.name = "test";
+  frame.source = Source{};
+  frame.source->name = "test.cpp";
+  frame.source->sourceReference = 23;
+  frame.line = 10;
+  frame.column = 1;
+  frame.presentationHint = StackFrame::ePresentationHintNormal;
+
+  Expected<json::Value> expected_frame = parse(R"({
+    "id": 1,
+    "name": "test",
+    "source": {
+      "name": "test.cpp",
+      "sourceReference": 23
+    },
+    "line": 10,
+    "column": 1,
+    "presentationHint": "normal"
+  })");
+
+  ASSERT_THAT_EXPECTED(expected_frame, llvm::Succeeded());
+  EXPECT_EQ(pp(*expected_frame), pp(frame));
+
+  frame.id = 2;
+  frame.canRestart = true;
+  frame.source = std::nullopt;
+  frame.presentationHint = StackFrame::ePresentationHintSubtle;
+  frame.name = "foo";
+  frame.instructionPointerReference = 12345;
+  expected_frame = parse(R"({
+    "id": 2,
+    "name": "foo",
+    "line": 0,
+    "column": 0,
+    "canRestart": true,
+    "instructionPointerReference": "0x3039",
+    "presentationHint": "subtle"
+  })");
+
+  ASSERT_THAT_EXPECTED(expected_frame, llvm::Succeeded());
+  EXPECT_EQ(pp(*expected_frame), pp(frame));
+}

>From 42ea774aa6232926b07d27b3a1b6ac9f1f0e06a0 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Sun, 21 Dec 2025 14:45:48 -0500
Subject: [PATCH 08/34] [SLP]Enable float point math ops as copyables elements.

Patch enables support for floating-point math operations as base
instructions for copyable elements. It also fixes some scheduling
issues found during testing.

Reviewers: hiraditya, RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/169857

Recommit after reverts in 9008922707915a6632fb74ed301bce11d8775e2a and
c2441689830fcb2588673dedba98da1219a2fb9e.
c2441689830fcb2588673dedba98da1219a2fb9e was caused by other issues, not
related to this patch directly
---
 llvm/include/llvm/IR/Instruction.h            |   6 +
 llvm/include/llvm/IR/IntrinsicInst.h          |   6 +
 llvm/lib/IR/Instruction.cpp                   |   7 +
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  90 +++++++++---
 .../AArch64/shuffle-vectors-mask-size.ll      |   6 +-
 .../X86/bv-root-part-of-graph.ll              |  11 +-
 .../SLPVectorizer/X86/crash_smallpt.ll        |  22 +--
 .../entry-no-bundle-but-extra-use-on-vec.ll   |  35 +++--
 .../extractelement-single-use-many-nodes.ll   |   3 +-
 .../X86/multi-node-for-copyable-parent.ll     |  15 +-
 .../X86/multi-node-user-with-copyable-ops.ll  |  19 +--
 .../non-commutative-op-in-commutative-inst.ll |  12 +-
 .../SLPVectorizer/X86/propagate-mmra.ll       |   4 +-
 .../reused-last-instruction-in-split-node.ll  |  24 +--
 .../X86/same-operands-but-copyable.ll         |   2 +-
 .../X86/user-with-multi-copyable-ops.ll       |  44 +++---
 .../X86/vect_copyable_in_binops.ll            | 128 ++++------------
 .../SLPVectorizer/alternate-non-profitable.ll |  11 +-
 .../SLPVectorizer/crash_exceed_scheduling.ll  | 138 ++++++++++++------
 .../extract-many-users-buildvector.ll         |  72 +++++----
 .../SLPVectorizer/insertelement-postpone.ll   |  40 ++---
 21 files changed, 351 insertions(+), 344 deletions(-)

diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 2eb4fd36c5b7d..11385666e7ff8 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -762,6 +762,12 @@ class Instruction : public User,
   /// applied to any type.
   ///
   LLVM_ABI bool isCommutative() const LLVM_READONLY;
+
+  /// Checks if the operand is commutative. In commutative operations, not all
+  /// operands might be commutable, e.g. for fmuladd only the first two
+  /// operands are commutable.
+  LLVM_ABI bool isCommutableOperand(unsigned Op) const LLVM_READONLY;
+
   static bool isCommutative(unsigned Opcode) {
     switch (Opcode) {
     case Add: case FAdd:
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 0622bfae2c845..0b25baa465a71 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -101,6 +101,12 @@ class IntrinsicInst : public CallInst {
     }
   }
 
+  /// Return true if the operand is commutable.
+  bool isCommutableOperand(unsigned Op) const {
+    constexpr unsigned NumCommutativeOps = 2;
+    return isCommutative() && Op < NumCommutativeOps;
+  }
+
   /// Checks if the intrinsic is an annotation.
   bool isAssumeLikeIntrinsic() const {
     switch (getIntrinsicID()) {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index f3d4d2424fe5b..7682c28e23b33 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1293,6 +1293,13 @@ bool Instruction::isCommutative() const {
   return isCommutative(getOpcode());
 }
 
+bool Instruction::isCommutableOperand(unsigned Op) const {
+  if (auto *II = dyn_cast<IntrinsicInst>(this))
+    return II->isCommutableOperand(Op);
+  // TODO: Should allow icmp/fcmp?
+  return isCommutative(getOpcode());
+}
+
 unsigned Instruction::getNumSuccessors() const {
   switch (getOpcode()) {
 #define HANDLE_TERM_INST(N, OPC, CLASS)                                        \
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6fd9759521543..6e05fcfe421cb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -575,6 +575,27 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
   return I->isCommutative();
 }
 
+/// Checks if the operand is commutative. In commutative operations, not all
+/// operands might be commutable, e.g. for fmuladd only the first two operands
+/// are commutable.
+static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
+                                bool IsCopyable = false) {
+  assert(::isCommutative(I, ValWithUses, IsCopyable) &&
+         "The instruction is not commutative.");
+  if (isa<CmpInst>(I))
+    return true;
+  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+    switch (BO->getOpcode()) {
+    case Instruction::Sub:
+    case Instruction::FSub:
+      return true;
+    default:
+      break;
+    }
+  }
+  return I->isCommutableOperand(Op);
+}
+
 /// This is a helper function to check whether \p I is commutative.
 /// This is a convenience wrapper that calls the two-parameter version of
 /// isCommutative with the same instruction for both parameters. This is
@@ -5328,13 +5349,14 @@ class slpvectorizer::BoUpSLP {
       if (ScheduleCopyableDataMap.empty())
         return false;
       SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
-      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
       ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
       if (Entries.empty())
         return false;
+      unsigned CurNumOps = 0;
       for (const Use &U : User->operands()) {
         if (U.get() != Op)
           continue;
+        ++CurNumOps;
         // Check all tree entries, if they have operands replaced by copyable
         // data.
         for (TreeEntry *TE : Entries) {
@@ -5367,27 +5389,43 @@ class slpvectorizer::BoUpSLP {
           // Same applies even for non-commutative cmps, because we can invert
           // their predicate potentially and, thus, reorder the operands.
           bool IsCommutativeUser =
-              ::isCommutative(User) ||
-              ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
-          if (!IsCommutativeUser && !isa<CmpInst>(User)) {
-            unsigned &OpCnt =
-                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
+              ::isCommutative(User) &&
+              ::isCommutableOperand(User, User, U.getOperandNo());
+          if (!IsCommutativeUser) {
+            Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
+            IsCommutativeUser =
+                ::isCommutative(MainOp, User) &&
+                ::isCommutableOperand(MainOp, User, U.getOperandNo());
+          }
+          // The commutative user with the same operands can be safely
+          // considered as non-commutative, operands reordering does not change
+          // the semantics.
+          assert(
+              (!IsCommutativeUser ||
+               (((::isCommutative(User) &&
+                  ::isCommutableOperand(User, User, 0) &&
+                  ::isCommutableOperand(User, User, 1)) ||
+                 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
+                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
+                                        User, 0) &&
+                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
+                                        User, 1))))) &&
+              "Expected commutative user with 2 first commutable operands");
+          bool IsCommutativeWithSameOps =
+              IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
+          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
+              !isa<CmpInst>(User)) {
             EdgeInfo EI(TE, U.getOperandNo());
-            if (!getScheduleCopyableData(EI, Op))
+            if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
               continue;
-            // Found copyable operand - continue.
-            OpCnt += Inc;
-            continue;
+            return false;
           }
           PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
               .first->getSecond() += Inc;
         }
       }
       if (PotentiallyReorderedEntriesCount.empty())
-        return all_of(OrderedEntriesCount,
-                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
-                        return P.second == NumOps;
-                      });
+        return true;
       // Check the commutative/cmp entries.
       for (auto &P : PotentiallyReorderedEntriesCount) {
         SmallPtrSet<Value *, 4> ParentsUniqueUsers;
@@ -5433,10 +5471,6 @@ class slpvectorizer::BoUpSLP {
       return all_of(PotentiallyReorderedEntriesCount,
                     [&](const std::pair<const TreeEntry *, unsigned> &P) {
                       return P.second == NumOps - 1;
-                    }) &&
-             all_of(OrderedEntriesCount,
-                    [&](const std::pair<const TreeEntry *, unsigned> &P) {
-                      return P.second == NumOps;
                     });
     }
 
@@ -5647,17 +5681,22 @@ class slpvectorizer::BoUpSLP {
                 auto It = OperandsUses.find(I);
                 assert(It != OperandsUses.end() && "Operand not found");
                 if (It->second > 0) {
-                  --It->getSecond();
-                  assert(TotalOpCount > 0 && "No more operands to decrement");
-                  --TotalOpCount;
                   if (ScheduleData *OpSD = getScheduleData(I)) {
                     if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
                       return;
+                    --It->getSecond();
+                    assert(TotalOpCount > 0 && "No more operands to decrement");
+                    --TotalOpCount;
                     DecrUnsched(OpSD, /*IsControl=*/false);
+                  } else {
+                    --It->getSecond();
+                    assert(TotalOpCount > 0 && "No more operands to decrement");
+                    --TotalOpCount;
                   }
                 }
               };
 
+          SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
           for (ScheduleBundle *Bundle : Bundles) {
             if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
               break;
@@ -5665,7 +5704,6 @@ class slpvectorizer::BoUpSLP {
             // Need to search for the lane since the tree entry can be
             // reordered.
             auto *It = find(Bundle->getTreeEntry()->Scalars, In);
-            SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
             bool IsNonSchedulableWithParentPhiNode =
                 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                 Bundle->getTreeEntry()->UserTreeIndex &&
@@ -10876,7 +10914,9 @@ class InstructionsCompatibilityAnalysis {
            Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
            Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
            Opcode == Instruction::And || Opcode == Instruction::Or ||
-           Opcode == Instruction::Xor;
+           Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
+           Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
+           Opcode == Instruction::FDiv;
   }
 
   /// Identifies the best candidate value, which represents main opcode
@@ -11217,6 +11257,10 @@ class InstructionsCompatibilityAnalysis {
       case Instruction::And:
       case Instruction::Or:
       case Instruction::Xor:
+      case Instruction::FAdd:
+      case Instruction::FMul:
+      case Instruction::FSub:
+      case Instruction::FDiv:
         VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
         break;
       default:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
index 0783a28f56d85..961662c664a31 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
@@ -11,10 +11,10 @@ define void @p(double %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> <i32 1, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> <double 1.000000e+00, double 1.000000e+00, double poison, double poison>, <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x double> zeroinitializer, [[TMP9]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
index 0cc4d3db5c537..1abc16da77c8e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
@@ -4,15 +4,16 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP9:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[FMUL:%.*]] = fmul float 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[FMUL:%.*]] = sitofp i32 0 to float
+; CHECK-NEXT:    [[SITOFP:%.*]] = sitofp i32 0 to float
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[SITOFP]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 0, i32 0, i32 poison, i32 7>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FMUL]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
index d13a8578d1e00..c1cc3f2dfc9e5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -7,36 +7,30 @@
 define void @main(i1 %arg) {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
 ; CHECK:       cond.true:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.end:
 ; CHECK-NEXT:    br label [[INVOKE_CONT:%.*]]
 ; CHECK:       invoke.cont:
-; CHECK-NEXT:    br i1 %arg, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
 ; CHECK:       arrayctor.cont:
 ; CHECK-NEXT:    [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER:%.*]]
 ; CHECK:       for.cond36.preheader:
-; CHECK-NEXT:    br i1 %arg, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
 ; CHECK:       cond.false51.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.true48.us:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
 ; CHECK:       cond.false66.us:
-; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> <double poison, double 0xBFA5CC2D1960285F>, double [[ADD_I276_US]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> <double 0.000000e+00, double 1.000000e-01>, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.400000e+02)
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 5.000000e+01, double 5.200000e+01>
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr undef, align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 2.000000e-01, double 3.000000e-01>, [[TMP1]]
-; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
+; CHECK-NEXT:    store <2 x double> <double 0x404900049667B5F2, double 0x404E0515D587DA7B>, ptr undef, align 8
+; CHECK-NEXT:    store <2 x double> <double 2.000000e-07, double 0x3F91A436DC4B6CE6>, ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
 ; CHECK-NEXT:    ret void
 ; CHECK:       cond.true63.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       for.body42.lr.ph.us:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
 ; CHECK:       _Z5clampd.exit.1:
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER]]
 ;
@@ -96,7 +90,7 @@ _Z5clampd.exit.1:
 define void @test(i1 %arg) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 %arg, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
 ; CHECK:       if.then38:
 ; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    store <2 x double> <double 0x3FFA356C1D8A7F76, double 0x3FFDC4F38B38BEF4>, ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index 6d713e83bbf4e..ca65ff88a4b81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -9,33 +9,38 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP20]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x float> [ [[TMP10]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP28]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]]
 ; CHECK-NEXT:    [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x float> <float 1.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP32:%.*]] = fmul <4 x float> <float -0.000000e+00, float -0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP31]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fadd <4 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP32]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]]
 ; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[CALL25]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 6942df532ae29..91ec61b275205 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -25,8 +25,7 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]]
 ; CHECK-NEXT:    [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[I82]], i32 3
-; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer
 ; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
index a07e617384e09..fd7f0c61b6737 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
@@ -6,14 +6,17 @@ define i1 @test(double %circ_radius, ptr %x) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP0]], 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[ADD20]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double poison, double poison>, double [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x double> <double -0.000000e+00, double 0.000000e+00, double 1.000000e+00, double -0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> <double poison, double poison, double 0.000000e+00, double poison>, <4 x i32> <i32 1, i32 2, i32 6, i32 0>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP15]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP8]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp olt <4 x double> [[TMP9]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
index eb3b183fd49eb..a9baedef3e509 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
@@ -6,17 +6,18 @@ define i1 @test(double %circ_radius, ptr %x, double %0) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]], double [[TMP0:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP1]], 0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD20]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], <double -0.000000e+00, double 0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double 0.000000e+00, double 1.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP9]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP7]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP12]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = fcmp olt <4 x double> [[TMP13]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
index 8c684325f8c68..b71dbc49e7478 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
@@ -8,13 +8,11 @@ define void @test(ptr %quat, float %call13) {
 ; CHECK-SAME: ptr [[QUAT:%.*]], float [[CALL13:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CALL121:%.*]] = load volatile float, ptr null, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fmuladd.f32(float [[CALL13]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[CALL121]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 0.000000e+00
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[CALL121]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP2]], <float 0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> zeroinitializer, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[QUAT]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll b/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
index ba52ef4c462a2..a84c6ae6b0980 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
@@ -5,9 +5,9 @@ define void @foo() {
 ; CHECK-LABEL: define void @foo() {
 ; CHECK-NEXT:  [[_PREHEADER16_PREHEADER:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4, !mmra [[META0:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 1.000000e+00, float 0.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP4]], <2 x float> zeroinitializer
 ; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr null, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
index f101991648276..6dc9806da0aa9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
@@ -4,9 +4,7 @@
 define float @test() {
 ; CHECK-LABEL: define float @test() {
 ; CHECK-NEXT:  [[LABEL:.*]]:
-; CHECK-NEXT:    [[SUB_I102_I:%.*]] = fsub float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison>, float [[SUB_I102_I]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float 0.000000e+00, i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -14,26 +12,12 @@ define float @test() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <8 x float> [[TMP7]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> <float poison, float 1.000000e+00>, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> zeroinitializer, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <12 x float> [[TMP16]], <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <8 x float> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> [[TMP20]], [[TMP8]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP17]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <12 x float> [[TMP22]], <12 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP21]], <8 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <20 x float> [[TMP23]], <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <20 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
 ; CHECK-NEXT:    br label %[[REGION_30:.*]]
 ; CHECK:       [[REGION_30]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP25]], %[[LABEL]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP10]], %[[LABEL]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <20 x float> [[TMP26]], i32 7
 ; CHECK-NEXT:    ret float [[TMP27]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
index 3645ad89af624..f1031937180a3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
@@ -7,10 +7,10 @@ define void @test(ptr %0, ptr %1, float %.sroa.3232.0.copyload) {
 ; CHECK-NEXT:  [[BB:.*:]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 12
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP3]], <float 0.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float [[DOTSROA_3232_0_COPYLOAD]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP5]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP5]], [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
index 7b298723d93b5..c58c63e51737c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
@@ -11,30 +11,23 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[ARRAYIDX_I1464:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX_I1464]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[THIS]], align 8
-; CHECK-NEXT:    [[DIV251:%.*]] = fmul double [[TMP1]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL257:%.*]] = fmul double [[TMP4]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL305:%.*]] = fmul double [[TMP4]], 0.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
-; CHECK-NEXT:    [[NEG356:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG356]], double 0.000000e+00, double 0.000000e+00)
 ; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[THIS]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = fneg double [[TMP3]]
 ; CHECK-NEXT:    [[NEG380:%.*]] = fmul double [[TMP1]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double [[MUL257]])
 ; CHECK-NEXT:    [[FNEG381:%.*]] = fneg double [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[DIV251]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FNEG381]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[NEG417:%.*]] = fneg double [[MUL257]]
-; CHECK-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG417]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[FNEG418:%.*]] = fneg double [[TMP16]]
-; CHECK-NEXT:    [[MUL419:%.*]] = fmul double [[DIV251]], [[FNEG418]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> poison, double [[MUL257]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = fneg <2 x double> [[TMP11]]
 ; CHECK-NEXT:    [[NEG436:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[FNEG437:%.*]] = fneg double [[TMP17]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> <double 1.000000e+00, double poison>, double [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP14]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = fneg <2 x double> [[TMP17]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fneg double [[TMP4]]
 ; CHECK-NEXT:    [[NEG455:%.*]] = fmul double [[TMP1]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG455]], double 0.000000e+00, double [[MUL305]])
@@ -42,19 +35,18 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[FNEG474:%.*]] = fneg double [[TMP20]]
 ; CHECK-NEXT:    [[NEG492:%.*]] = fneg double [[MUL305]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG492]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x double> poison, double [[DIV251]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x double> poison, double [[FNEG437]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x double> [[TMP24]], double [[TMP19]], i32 1
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[FNEG474]], i32 2
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP28:%.*]] = fmul <4 x double> [[TMP23]], [[TMP27]]
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> <double 1.000000e+00, double 0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP23]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x double> poison, double [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> [[TMP29]], double [[FNEG381]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x double> [[TMP25]], double [[TMP10]], i32 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <8 x double> [[TMP29]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[MUL419]], i32 3
-; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <4 x double> [[TMP28]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP28]], double [[TMP19]], i32 5
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[FNEG474]], i32 6
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x double> [[TMP33]], double [[TMP21]], i32 7
+; CHECK-NEXT:    [[TMP34:%.*]] = fmul <8 x double> [[TMP31]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = fptrunc <8 x double> [[TMP34]] to <8 x float>
 ; CHECK-NEXT:    store <8 x float> [[TMP35]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 2a0e7889f0f34..d10d26671e76b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
 
 define void @add0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @add0(
@@ -336,32 +336,12 @@ entry:
 }
 
 define void @add1f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @add1f(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @add1f(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
-; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: @add1f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -387,18 +367,9 @@ entry:
 define void @sub0f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -565,18 +536,9 @@ entry:
 define void @mulf(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -631,32 +593,12 @@ entry:
 }
 
 define void @add1fn(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @add1fn(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @add1fn(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
-; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: @add1fn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -682,18 +624,9 @@ entry:
 define void @sub0fn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -780,18 +713,9 @@ entry:
 define void @mulfn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulfn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index 125c2dce32663..b23da5fa263f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -52,11 +52,12 @@ define <2 x float> @replace_through_casts_and_binop(i16 %inp) {
 ; CHECK-SAME: i16 [[INP:%.*]]) {
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[INP]], -10
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i16 [[INP]], 5
-; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i16 [[MUL]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 2.000000e+00
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp i16 [[ADD]] to float
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[MUL]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x float> [[TMP5]], <float 2.000000e+00, float -0.000000e+00>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %add = add nsw i16 %inp, -10
diff --git a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
index 793d089404d1e..c79969de6ac41 100644
--- a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
@@ -1,52 +1,98 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
 
 define void @exceed(double %0, double %1) {
-; CHECK-LABEL: @exceed(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
-; CHECK-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
-; CHECK-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
-; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
-; CHECK-NEXT:    switch i32 undef, label [[BB1:%.*]] [
-; CHECK-NEXT:    i32 0, label [[BB2:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       bb1:
-; CHECK-NEXT:    br label [[LABEL:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    br label [[LABEL]]
-; CHECK:       label:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
-; CHECK-NEXT:    ret void
+; X86-LABEL: @exceed(
+; X86-NEXT:  entry:
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; X86-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; X86-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; X86-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; X86-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
+; X86-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
+; X86-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; X86-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; X86-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; X86-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
+; X86-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
+; X86-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
+; X86-NEXT:    switch i32 undef, label [[BB1:%.*]] [
+; X86-NEXT:      i32 0, label [[BB2:%.*]]
+; X86-NEXT:    ]
+; X86:       bb1:
+; X86-NEXT:    br label [[LABEL:%.*]]
+; X86:       bb2:
+; X86-NEXT:    br label [[LABEL]]
+; X86:       label:
+; X86-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
+; X86-NEXT:    ret void
+;
+; AARCH64-LABEL: @exceed(
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; AARCH64-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; AARCH64-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; AARCH64-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; AARCH64-NEXT:    [[IX2:%.*]] = fmul double [[TMP7]], [[TMP7]]
+; AARCH64-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> <double poison, double 1.000000e+00>, <2 x i32> <i32 0, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = fdiv fast <2 x double> [[TMP9]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; AARCH64-NEXT:    [[IX:%.*]] = fmul double [[TMP12]], undef
+; AARCH64-NEXT:    [[IX1:%.*]] = fmul double [[TMP12]], undef
+; AARCH64-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], [[TMP8]]
+; AARCH64-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP11]], undef
+; AARCH64-NEXT:    switch i32 undef, label [[BB1:%.*]] [
+; AARCH64-NEXT:      i32 0, label [[BB2:%.*]]
+; AARCH64-NEXT:    ]
+; AARCH64:       bb1:
+; AARCH64-NEXT:    br label [[LABEL:%.*]]
+; AARCH64:       bb2:
+; AARCH64-NEXT:    br label [[LABEL]]
+; AARCH64:       label:
+; AARCH64-NEXT:    [[TMP16:%.*]] = phi <2 x double> [ [[TMP14]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
+; AARCH64-NEXT:    ret void
 ;
 entry:
   %i10 = fdiv fast double %0, %1
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 32e59697486a7..439943102b58a 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -7,56 +7,52 @@ define i1 @test(float %0, double %1) {
 ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; X86-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
-; X86-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; X86-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
-; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; X86-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
+; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
+; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; X86-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
 ; X86-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
+; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP6]], [[TMP12]]
 ; X86-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; X86-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP14]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT:    [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP16]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
-; X86-NEXT:    [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]]
-; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]]
-; X86-NEXT:    [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; X86-NEXT:    [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float>
-; X86-NEXT:    [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer
-; X86-NEXT:    [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer
-; X86-NEXT:    [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]]
-; X86-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]])
-; X86-NEXT:    ret i1 [[TMP25]]
+; X86-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP15]], [[TMP10]]
+; X86-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP15]], [[TMP10]]
+; X86-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
+; X86-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
+; X86-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
+; X86-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
+; X86-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
+; X86-NEXT:    ret i1 [[TMP23]]
 ;
 ; AARCH64-LABEL: define i1 @test
 ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
-; AARCH64-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; AARCH64-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
-; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
-; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
+; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
+; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AARCH64-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
+; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
 ; AARCH64-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
-; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]]
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP6]], [[TMP13]]
 ; AARCH64-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP16:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP15]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP17:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP17]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP19:%.*]] = fsub <8 x double> [[TMP16]], [[TMP18]]
-; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x double> [[TMP16]], [[TMP18]]
-; AARCH64-NEXT:    [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; AARCH64-NEXT:    [[TMP22:%.*]] = fptrunc <8 x double> [[TMP21]] to <8 x float>
-; AARCH64-NEXT:    [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], zeroinitializer
-; AARCH64-NEXT:    [[TMP24:%.*]] = fcmp oeq <8 x float> [[TMP23]], zeroinitializer
-; AARCH64-NEXT:    [[TMP25:%.*]] = freeze <8 x i1> [[TMP24]]
-; AARCH64-NEXT:    [[TMP26:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP25]])
-; AARCH64-NEXT:    ret i1 [[TMP26]]
+; AARCH64-NEXT:    [[TMP17:%.*]] = fsub <8 x double> [[TMP16]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP18:%.*]] = fmul <8 x double> [[TMP16]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float>
+; AARCH64-NEXT:    [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer
+; AARCH64-NEXT:    [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer
+; AARCH64-NEXT:    [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]]
+; AARCH64-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]])
+; AARCH64-NEXT:    ret i1 [[TMP24]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index eefc99feebb95..09e3ef41b3dbe 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -6,34 +6,34 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778)
 ; X86-LABEL: @test(
 ; X86-NEXT:  entry:
 ; X86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; X86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; X86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
-; X86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; X86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
-; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
+; X86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
+; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
 ; X86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
-; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
-; X86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
-; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
+; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
 ; X86-NEXT:    [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; X86-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AARCH86-LABEL: @test(
 ; AARCH86-NEXT:  entry:
 ; AARCH86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; AARCH86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; AARCH86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
-; AARCH86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; AARCH86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
-; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; AARCH86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
+; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
+; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
 ; AARCH86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
-; AARCH86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
-; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
-; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
-; AARCH86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
+; AARCH86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
+; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
+; AARCH86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
+; AARCH86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+; AARCH86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; AARCH86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
 ; AARCH86-NEXT:    [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; AARCH86-NEXT:    ret <4 x double> [[I1994]]
 ;

>From 295a01f36c7ad0f9cacd10e6627993e06d8376e9 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Fri, 26 Dec 2025 12:48:53 -0800
Subject: [PATCH 09/34] [MemProf] Fix reporting with
 -memprof-matching-cold-threshold (#173327)

With the -memprof-matching-cold-threshold option, we hint as cold
allocations where the fraction of cold bytes is at least the given
threshold. However, we were incorrectly reporting all of the
allocation's contexts and bytes as hinted cold.

Fix this to report the non-cold contexts as ignored. To do this,
refactor out some existing reporting, and also keep track of the
original allocation type for each context in the Trie along with its
ContextTotalSize information. Most of the changes are the change to this
array's type and name.
---
 .../include/llvm/Analysis/MemoryProfileInfo.h | 23 +++++--
 llvm/lib/Analysis/MemoryProfileInfo.cpp       | 63 +++++++++++++------
 llvm/test/Transforms/PGOProfile/memprof.ll    |  4 +-
 3 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index ba4010bd8f50c..cdd8ce0c66771 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -61,6 +61,15 @@ LLVM_ABI void removeAnyExistingAmbiguousAttribute(CallBase *CB);
 /// profile but that we haven't yet been able to disambiguate.
 LLVM_ABI void addAmbiguousAttribute(CallBase *CB);
 
+// During matching we also keep the AllocationType along with the
+// ContextTotalSize in the Trie for the most accurate reporting when we decide
+// to hint unambiguously where there is a dominant type. We don't put the
+// AllocationType in the ContextTotalSize struct as it isn't needed there
+// during the LTO step, because due to context trimming a summarized
+// context with its allocation type can correspond to multiple context/size
+// pairs. Here the redundancy is a short-lived convenience.
+using ContextSizeTypePair = std::pair<ContextTotalSize, AllocationType>;
+
 /// Class to build a trie of call stack contexts for a particular profiled
 /// allocation call, along with their associated allocation types.
 /// The allocation will be at the root of the trie, which is then used to
@@ -75,8 +84,9 @@ class CallStackTrie {
     // If the user has requested reporting of hinted sizes, keep track of the
     // associated full stack id and profiled sizes. Can have more than one
     // after trimming (e.g. when building from metadata). This is only placed on
-    // the last (root-most) trie node for each allocation context.
-    std::vector<ContextTotalSize> ContextSizeInfo;
+    // the last (root-most) trie node for each allocation context. Also
+    // track the original allocation type of the context.
+    std::vector<ContextSizeTypePair> ContextInfo;
     // Map of caller stack id to the corresponding child Trie node.
     std::map<uint64_t, CallStackTrieNode *> Callers;
     CallStackTrieNode(AllocationType Type)
@@ -118,10 +128,11 @@ class CallStackTrie {
     delete Node;
   }
 
-  // Recursively build up a complete list of context size information from the
-  // trie nodes reached form the given Node, for hint size reporting.
-  void collectContextSizeInfo(CallStackTrieNode *Node,
-                              std::vector<ContextTotalSize> &ContextSizeInfo);
+  // Recursively build up a complete list of context information from the
+  // trie nodes reached from the given Node, including each context's
+  // ContextTotalSize and AllocationType, for hint size reporting.
+  void collectContextInfo(CallStackTrieNode *Node,
+                          std::vector<ContextSizeTypePair> &ContextInfo);
 
   // Recursively convert hot allocation types to notcold, since we don't
   // actually do any cloning for hot contexts, to facilitate more aggressive
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index b09f4ed78ca7e..fb22a098c60fb 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -181,7 +181,13 @@ void CallStackTrie::addCallStack(
     Curr = New;
   }
   assert(Curr);
-  llvm::append_range(Curr->ContextSizeInfo, ContextSizeInfo);
+  // Append all of the ContextSizeInfo, along with their original AllocType.
+  llvm::append_range(Curr->ContextInfo,
+                     llvm::map_range(ContextSizeInfo,
+                                     [AllocType](const ContextTotalSize &CTS) {
+                                       return ContextSizeTypePair(CTS,
+                                                                  AllocType);
+                                     }));
 }
 
 void CallStackTrie::addCallStack(MDNode *MIB) {
@@ -216,7 +222,7 @@ void CallStackTrie::addCallStack(MDNode *MIB) {
 
 static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
                              AllocationType AllocType,
-                             ArrayRef<ContextTotalSize> ContextSizeInfo,
+                             ArrayRef<ContextSizeTypePair> ContextInfo,
                              const uint64_t MaxColdSize,
                              bool BuiltFromExistingMetadata,
                              uint64_t &TotalBytes, uint64_t &ColdBytes) {
@@ -225,7 +231,7 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
   MIBPayload.push_back(
       MDString::get(Ctx, getAllocTypeAttributeString(AllocType)));
 
-  if (ContextSizeInfo.empty()) {
+  if (ContextInfo.empty()) {
     // The profile matcher should have provided context size info if there was a
     // MinCallsiteColdBytePercent < 100. Here we check >=100 to gracefully
     // handle a user-provided percent larger than 100. However, we may not have
@@ -234,7 +240,8 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
     return MDNode::get(Ctx, MIBPayload);
   }
 
-  for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
+  for (const auto &[CSI, AT] : ContextInfo) {
+    const auto &[FullStackId, TotalSize] = CSI;
     TotalBytes += TotalSize;
     bool LargeColdContext = false;
     if (AllocType == AllocationType::Cold) {
@@ -267,11 +274,11 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
   return MDNode::get(Ctx, MIBPayload);
 }
 
-void CallStackTrie::collectContextSizeInfo(
-    CallStackTrieNode *Node, std::vector<ContextTotalSize> &ContextSizeInfo) {
-  llvm::append_range(ContextSizeInfo, Node->ContextSizeInfo);
+void CallStackTrie::collectContextInfo(
+    CallStackTrieNode *Node, std::vector<ContextSizeTypePair> &ContextInfo) {
+  llvm::append_range(ContextInfo, Node->ContextInfo);
   for (auto &Caller : Node->Callers)
-    collectContextSizeInfo(Caller.second, ContextSizeInfo);
+    collectContextInfo(Caller.second, ContextInfo);
 }
 
 void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) {
@@ -283,6 +290,17 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) {
     convertHotToNotCold(Caller.second);
 }
 
+// Helper to emit messages for non-cold contexts that are ignored for various
+// reasons when reporting of hinted bytes is enabled.
+static void emitIgnoredNonColdContextMessage(StringRef Tag,
+                                             uint64_t FullStackId,
+                                             StringRef Extra,
+                                             uint64_t TotalSize) {
+  errs() << "MemProf hinting: Total size for " << Tag
+         << " non-cold full allocation context hash " << FullStackId << Extra
+         << ": " << TotalSize << "\n";
+}
+
 // Copy over some or all of NewMIBNodes to the SavedMIBNodes vector, depending
 // on options that enable filtering out some NotCold contexts.
 static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
@@ -321,9 +339,7 @@ static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
       uint64_t TS =
           mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(1))
               ->getZExtValue();
-      errs() << "MemProf hinting: Total size for " << Tag
-             << " non-cold full allocation context hash " << FullStackId
-             << Extra << ": " << TS << "\n";
+      emitIgnoredNonColdContextMessage(Tag, FullStackId, Extra, TS);
     }
   };
 
@@ -430,10 +446,10 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
   // Trim context below the first node in a prefix with a single alloc type.
   // Add an MIB record for the current call stack prefix.
   if (hasSingleAllocType(Node->AllocTypes)) {
-    std::vector<ContextTotalSize> ContextSizeInfo;
-    collectContextSizeInfo(Node, ContextSizeInfo);
+    std::vector<ContextSizeTypePair> ContextInfo;
+    collectContextInfo(Node, ContextInfo);
     MIBNodes.push_back(createMIBNode(
-        Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo,
+        Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextInfo,
         MaxColdSize, BuiltFromExistingMetadata, TotalBytes, ColdBytes));
     return true;
   }
@@ -486,10 +502,10 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
   // non-cold allocation type.
   if (!CalleeHasAmbiguousCallerContext)
     return false;
-  std::vector<ContextTotalSize> ContextSizeInfo;
-  collectContextSizeInfo(Node, ContextSizeInfo);
+  std::vector<ContextSizeTypePair> ContextInfo;
+  collectContextInfo(Node, ContextInfo);
   MIBNodes.push_back(createMIBNode(
-      Ctx, MIBCallStack, AllocationType::NotCold, ContextSizeInfo, MaxColdSize,
+      Ctx, MIBCallStack, AllocationType::NotCold, ContextInfo, MaxColdSize,
       BuiltFromExistingMetadata, TotalBytes, ColdBytes));
   return true;
 }
@@ -503,9 +519,16 @@ void CallStackTrie::addSingleAllocTypeAttribute(CallBase *CI, AllocationType AT,
   removeAnyExistingAmbiguousAttribute(CI);
   CI->addFnAttr(A);
   if (MemProfReportHintedSizes) {
-    std::vector<ContextTotalSize> ContextSizeInfo;
-    collectContextSizeInfo(Alloc, ContextSizeInfo);
-    for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
+    std::vector<ContextSizeTypePair> ContextInfo;
+    collectContextInfo(Alloc, ContextInfo);
+    for (const auto &[CSI, OrigAT] : ContextInfo) {
+      const auto &[FullStackId, TotalSize] = CSI;
+      // If the original alloc type is not the one being applied as the hint,
+      // report that we ignored this context.
+      if (AT != OrigAT) {
+        emitIgnoredNonColdContextMessage("ignored", FullStackId, "", TotalSize);
+        continue;
+      }
       errs() << "MemProf hinting: Total size for full allocation context hash "
              << FullStackId << " and " << Descriptor << " alloc type "
              << getAllocTypeAttributeString(AT) << ": " << TotalSize << "\n";
diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll
index f6a89a8ceb86a..a1f0f1d403c8f 100644
--- a/llvm/test/Transforms/PGOProfile/memprof.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof.ll
@@ -400,10 +400,10 @@ for.end:                                          ; preds = %for.cond
 ;; with the full allocation context hash, type, and size in bytes.
 ; TOTALSIZESTHRESH60: Total size for full allocation context hash 8525406123785421946 and dominant alloc type cold: 10
 ; TOTALSIZESTHRESH60: Total size for full allocation context hash 11714230664165068698 and dominant alloc type cold: 10
-; TOTALSIZESTHRESH60: Total size for full allocation context hash 5725971306423925017 and dominant alloc type cold: 10
+; TOTALSIZESTHRESH60: Total size for ignored non-cold full allocation context hash 5725971306423925017: 10
 ; TOTALSIZESTHRESH60: Total size for full allocation context hash 16342802530253093571 and dominant alloc type cold: 10
 ; TOTALSIZESTHRESH60: Total size for full allocation context hash 18254812774972004394 and dominant alloc type cold: 10
-; TOTALSIZESTHRESH60: Total size for full allocation context hash 1093248920606587996 and dominant alloc type cold: 10
+; TOTALSIZESTHRESH60: Total size for ignored non-cold full allocation context hash 1093248920606587996: 10
 ; TOTALSIZESSINGLE: Total size for full allocation context hash 6792096022461663180 and single alloc type notcold: 10
 ; REMARKSINGLE: remark: memprof.cc:25:13: call in function main marked with memprof allocation attribute notcold
 ; TOTALSIZESSINGLE: Total size for full allocation context hash 15737101490731057601 and single alloc type cold: 10

>From 60e5b86052ca1103499e87ee32c8228c71b6c753 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 26 Dec 2025 21:36:34 +0000
Subject: [PATCH 10/34] [VPlan] Support extends and truncs in
 getSCEVExprForVPValue. (NFCI)

Handle extends and truncates in getSCEVExprForVPValue. This enables
computing SCEVs in more cases in the VPlan-based cost-model, but should
compute the matching costs in all cases.
---
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 22 ++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 165eac7d19a56..8b373f6d585c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanUtils.h"
+#include "VPlanAnalysis.h"
 #include "VPlanCFG.h"
 #include "VPlanDominatorTree.h"
 #include "VPlanPatternMatch.h"
@@ -116,6 +117,27 @@ const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
     return CreateSCEV({LHSVal, RHSVal}, [&](ArrayRef<const SCEV *> Ops) {
       return SE.getMinusSCEV(Ops[0], Ops[1], SCEV::FlagAnyWrap, 0);
     });
+  if (match(V, m_Trunc(m_VPValue(LHSVal)))) {
+    const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
+    Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
+    return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
+      return SE.getTruncateExpr(Ops[0], DestTy);
+    });
+  }
+  if (match(V, m_ZExt(m_VPValue(LHSVal)))) {
+    const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
+    Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
+    return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
+      return SE.getZeroExtendExpr(Ops[0], DestTy);
+    });
+  }
+  if (match(V, m_SExt(m_VPValue(LHSVal)))) {
+    const VPlan *Plan = V->getDefiningRecipe()->getParent()->getPlan();
+    Type *DestTy = VPTypeAnalysis(*Plan).inferScalarType(V);
+    return CreateSCEV({LHSVal}, [&](ArrayRef<const SCEV *> Ops) {
+      return SE.getSignExtendExpr(Ops[0], DestTy);
+    });
+  }
 
   // TODO: Support constructing SCEVs for more recipes as needed.
   const VPRecipeBase *DefR = V->getDefiningRecipe();

>From c9eb572b141746426560c650fadf9ab75cca9867 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=AD=90=E6=98=82?= <2802328816 at qq.com>
Date: Sat, 27 Dec 2025 05:57:57 +0800
Subject: [PATCH 11/34] [LoopVectorize] Support vectorization of frexp
 intrinsic (#172957)

This patch enables the vectorization of the llvm.frexp intrinsic.
Following the suggestion in #112408, frexp is moved from
isTriviallyScalarizable to isTriviallyVectorizable.

Fixes #112408
---
 llvm/lib/Analysis/VectorUtils.cpp             |  4 +-
 .../Vectorize/LoopVectorizationLegality.cpp   | 14 +---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  7 +-
 .../multiple-result-intrinsics.ll             | 73 ++++++++++++++++++-
 .../Transforms/LoopVectorize/struct-return.ll | 35 +++++----
 5 files changed, 98 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 933e63e956f63..99e9ecb01dd75 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::exp:
   case Intrinsic::exp10:
   case Intrinsic::exp2:
+  case Intrinsic::frexp:
   case Intrinsic::ldexp:
   case Intrinsic::log:
   case Intrinsic::log10:
@@ -129,10 +130,7 @@ bool llvm::isTriviallyScalarizable(Intrinsic::ID ID,
   if (TTI && Intrinsic::isTargetIntrinsic(ID))
     return TTI->isTargetIntrinsicTriviallyScalarizable(ID);
 
-  // TODO: Move frexp to isTriviallyVectorizable.
-  // https://github.com/llvm/llvm-project/issues/112408
   switch (ID) {
-  case Intrinsic::frexp:
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::ssub_with_overflow:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 5238a5d7d7c24..93229ea625a5d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -800,18 +800,6 @@ static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
   return Scalarize;
 }
 
-/// Returns true if the call return type `Ty` can be widened by the loop
-/// vectorizer.
-static bool canWidenCallReturnType(Type *Ty) {
-  auto *StructTy = dyn_cast<StructType>(Ty);
-  // TODO: Remove the homogeneous types restriction. This is just an initial
-  // simplification. When we want to support things like the overflow intrinsics
-  // we will have to lift this restriction.
-  if (StructTy && !StructTy->containsHomogeneousTypes())
-    return false;
-  return canVectorizeTy(StructTy);
-}
-
 bool LoopVectorizationLegality::canVectorizeInstrs() {
   bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
   bool Result = true;
@@ -1026,7 +1014,7 @@ bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
     // For now, we only recognize struct values returned from calls where
     // all users are extractvalue as vectorizable. All element types of the
     // struct must be types that can be widened.
-    return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
+    return isa<CallInst>(Inst) && canVectorizeTy(InstTy) &&
            all_of(Inst.users(), IsaPred<ExtractValueInst>);
   };
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 40b8ce999545c..c425507eb2641 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1695,8 +1695,11 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
 
   SmallVector<Type *, 2> TysForDecl;
   // Add return type if intrinsic is overloaded on it.
-  if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
-    TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
+  if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
+                                             State.TTI)) {
+    Type *RetTy = toVectorizedTy(getResultType(), State.VF);
+    append_range(TysForDecl, getContainedTypes(RetTy));
+  }
   SmallVector<Value *, 4> Args;
   for (const auto &I : enumerate(operands())) {
     // Some intrinsics have a scalar argument - don't replace it with a
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/multiple-result-intrinsics.ll
index b19f9c5a3b60d..c6fcbed983d3c 100644
--- a/llvm/test/Transforms/LoopVectorize/multiple-result-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-result-intrinsics.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|modf|extract|store)" --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|frexp|modf|extract|store)" --version 5
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s -S -o - | FileCheck %s
 
 define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
@@ -277,3 +277,74 @@ exit:
   ret void
 }
 
+define void @frexp_f32(ptr noalias %in, ptr noalias writeonly %out_mantissa, ptr noalias writeonly %out_exponent) {
+; CHECK-LABEL: define void @frexp_f32(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_MANTISSA:%.*]], ptr noalias writeonly [[OUT_EXPONENT:%.*]]) {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:  [[EXIT:.*:]]
+; CHECK:    [[TMP1:%.*]] = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x i32> } [[TMP1]], 0
+; CHECK:    [[TMP3:%.*]] = extractvalue { <2 x float>, <2 x i32> } [[TMP1]], 1
+; CHECK:    store <2 x float> [[TMP2]], ptr [[TMP4:%.*]], align 4
+; CHECK:    store <2 x i32> [[TMP3]], ptr [[TMP5:%.*]], align 4
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[EXIT1:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, i32 } @llvm.frexp.f32.i32(float %in_val)
+  %mantissa = extractvalue { float, i32 } %call, 0
+  %exponent = extractvalue { float, i32 } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_mantissa, i64 %iv
+  store float %mantissa, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %out_exponent, i64 %iv
+  store i32 %exponent, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @frexp_f64(ptr noalias %in, ptr noalias writeonly %out_mantissa, ptr noalias writeonly %out_exponent) {
+; CHECK-LABEL: define void @frexp_f64(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_MANTISSA:%.*]], ptr noalias writeonly [[OUT_EXPONENT:%.*]]) {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:  [[EXIT:.*:]]
+; CHECK:    [[TMP1:%.*]] = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x i32> } [[TMP1]], 0
+; CHECK:    [[TMP3:%.*]] = extractvalue { <2 x double>, <2 x i32> } [[TMP1]], 1
+; CHECK:    store <2 x double> [[TMP2]], ptr [[TMP4:%.*]], align 8
+; CHECK:    store <2 x i32> [[TMP3]], ptr [[TMP5:%.*]], align 4
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[EXIT1:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+  %in_val = load double, ptr %arrayidx, align 8
+  %call = tail call { double, i32 } @llvm.frexp.f64.i32(double %in_val)
+  %mantissa = extractvalue { double, i32 } %call, 0
+  %exponent = extractvalue { double, i32 } %call, 1
+  %arrayidx2 = getelementptr inbounds double, ptr %out_mantissa, i64 %iv
+  store double %mantissa, ptr %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds i32, ptr %out_exponent, i64 %iv
+  store i32 %exponent, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 70c6c7e900c51..83c87f1e15e8f 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -378,27 +378,30 @@ exit:
   ret void
 }
 
-; Negative test. Widening structs with mixed element types is not supported.
-; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
-define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
-; CHECK-LABEL: define void @negative_mixed_element_type_struct_return(
+; CHECK-REMARKS: remark: {{.*}} vectorized loop (vectorization width: 2, interleaved count: 1)
+define void @mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @mixed_element_type_struct_return(
 ; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
-; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]]
-; CHECK-NEXT:    [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call { float, i32 } @baz(float [[IN_VAL]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[EXTRACT_A:%.*]] = extractvalue { float, i32 } [[CALL]], 0
-; CHECK-NEXT:    [[EXTRACT_B:%.*]] = extractvalue { float, i32 } [[CALL]], 1
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call { <2 x float>, <2 x i32> } @fixed_vec_baz(<2 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x i32> } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <2 x float>, <2 x i32> } [[TMP1]], 1
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]]
-; CHECK-NEXT:    store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[OUT_B]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 2
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;

>From 6ccf97674b2deaa03e271725306b18a712a56113 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 27 Dec 2025 10:19:03 +0900
Subject: [PATCH 12/34] [bazel] configure.bzl: Disable `Xtensa` (#173073) by
 default.

It hasn't been built by default in CMake side.
---
 utils/bazel/configure.bzl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl
index 247a3682736e9..09861b2f00911 100644
--- a/utils/bazel/configure.bzl
+++ b/utils/bazel/configure.bzl
@@ -25,7 +25,6 @@ DEFAULT_TARGETS = [
     "WebAssembly",
     "X86",
     "XCore",
-    "Xtensa",
 ]
 
 MAX_TRAVERSAL_STEPS = 1000000  # "big number" upper bound on total visited dirs

>From 08debd7f44614f3b8f8a5b62fe80cc051ce17918 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 27 Dec 2025 13:27:00 +0900
Subject: [PATCH 13/34] [compiler-rt][builtins] Add `-Wno-c2y-extensions`

Since #162662, `__COUNTER__` has caused warnings.
---
 compiler-rt/cmake/builtin-config-ix.cmake | 1 +
 compiler-rt/lib/builtins/CMakeLists.txt   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index fd815f841fec0..d5e41fc0a1fa8 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -18,6 +18,7 @@ builtin_check_c_compiler_flag(-fno-lto              COMPILER_RT_HAS_FNO_LTO_FLAG
 builtin_check_c_compiler_flag(-fno-profile-generate COMPILER_RT_HAS_FNO_PROFILE_GENERATE_FLAG)
 builtin_check_c_compiler_flag(-fno-profile-instr-generate COMPILER_RT_HAS_FNO_PROFILE_INSTR_GENERATE_FLAG)
 builtin_check_c_compiler_flag(-fno-profile-instr-use COMPILER_RT_HAS_FNO_PROFILE_INSTR_USE_FLAG)
+builtin_check_c_compiler_flag(-Wno-c2y-extensions   COMPILER_RT_HAS_WNO_C2Y_EXTENSIONS)
 builtin_check_c_compiler_flag(-Wno-pedantic         COMPILER_RT_HAS_WNO_PEDANTIC)
 builtin_check_c_compiler_flag(-nogpulib             COMPILER_RT_HAS_NOGPULIB_FLAG)
 builtin_check_c_compiler_flag(-flto                 COMPILER_RT_HAS_FLTO_FLAG)
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index a5ce9264d1c6b..107e48b84b2b7 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -1094,6 +1094,7 @@ if (COMPILER_RT_BUILD_CRT)
   append_list_if(COMPILER_RT_HAS_INITFINI_ARRAY -DCRT_HAS_INITFINI_ARRAY CRT_CFLAGS)
   append_list_if(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY -DEH_USE_FRAME_REGISTRY CRT_CFLAGS)
   append_list_if(COMPILER_RT_HAS_FPIC_FLAG -fPIC CRT_CFLAGS)
+  append_list_if(COMPILER_RT_HAS_WNO_C2Y_EXTENSIONS -Wno-c2y-extensions CRT_CFLAGS)
   append_list_if(COMPILER_RT_HAS_WNO_PEDANTIC -Wno-pedantic CRT_CFLAGS)
   if (COMPILER_RT_HAS_FCF_PROTECTION_FLAG)
     append_list_if(COMPILER_RT_ENABLE_CET -fcf-protection=full CRT_CFLAGS)

>From b9ada74a667c07456ea51e5a2d75788a421c5950 Mon Sep 17 00:00:00 2001
From: mitchell <mitchell.xu2 at gmail.com>
Date: Sat, 27 Dec 2025 15:57:56 +0800
Subject: [PATCH 14/34] [Github][CI] Trigger `code-lint` for clang-tidy
 documentations (#173700)

Previously we added `doc8` to the `code-lint` workflow. However, PRs that
contain only documentation changes won't trigger this workflow.

An example: https://github.com/llvm/llvm-project/pull/173699/checks
didn't trigger `code-lint`.

This commit fixes the issue.
---
 .github/workflows/pr-code-lint.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml
index 64fecd26af004..b314f740e59de 100644
--- a/.github/workflows/pr-code-lint.yml
+++ b/.github/workflows/pr-code-lint.yml
@@ -10,6 +10,7 @@ on:
       - 'users/**'
     paths:
       - 'clang-tools-extra/clang-tidy/**'
+      - 'clang-tools-extra/docs/clang-tidy/**'
       - '.github/workflows/pr-code-lint.yml'
 
 jobs:

>From 2c13075b2956819dc5a562f14a48d027537cf2e5 Mon Sep 17 00:00:00 2001
From: lonely eagle <2020382038 at qq.com>
Date: Sat, 27 Dec 2025 17:39:06 +0800
Subject: [PATCH 15/34] [mlir][dataflow] Fix DataFlowFramework crash by adding
 isBlockEnd logic to ProgramPoint::print (#173471)

Running -test-dead-code-analysis -debug on the following IR will trigger
a data-flow analysis framework assert, you can see
https://github.com/llvm/llvm-project/blob/2d6b1b174194198498eb10ae811632b3dd945ecf/mlir/include/mlir/Analysis/DataFlowFramework.h#L110
Fix the DataFlowFramework crash by adding isBlockEnd logic to
ProgramPoint::print.
```
func.func @trs(%idx1: index, %idx2: index, %s: f32) {
  scf.parallel (%i) = (%idx1) to (%idx2) step (%idx2) {
    %r = memref.alloca() : memref<10xf32>
    scf.forall (%e2) in (%idx2) {
      %a = memref.load %r[%idx2] : memref<10xf32>
    }
  }
  return
}
```
---
 mlir/lib/Analysis/DataFlowFramework.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Analysis/DataFlowFramework.cpp b/mlir/lib/Analysis/DataFlowFramework.cpp
index 9352ab02f7472..36b87bd5bb838 100644
--- a/mlir/lib/Analysis/DataFlowFramework.cpp
+++ b/mlir/lib/Analysis/DataFlowFramework.cpp
@@ -67,8 +67,12 @@ void ProgramPoint::print(raw_ostream &os) const {
        << OpWithFlags(getPrevOp(), OpPrintingFlags().skipRegions());
     return;
   }
-  os << "<before operation>:"
-     << OpWithFlags(getNextOp(), OpPrintingFlags().skipRegions());
+  if (!isBlockEnd()) {
+    os << "<before operation>:"
+       << OpWithFlags(getNextOp(), OpPrintingFlags().skipRegions());
+    return;
+  }
+  os << "<beginning of empty block>";
 }
 
 //===----------------------------------------------------------------------===//

>From 26cc61ef2f53ed0e90f328fe2a3a1505ecb858f6 Mon Sep 17 00:00:00 2001
From: aokblast <aokblast at FreeBSD.org>
Date: Sat, 27 Dec 2025 20:16:53 +0800
Subject: [PATCH 16/34] [lit] Disable ulimit-nodarwin test on FreeBSD (#173155)

FreeBSD does not support using ulimit to raise the maximum number of open
files per process. Darwin inherits this characteristic from BSD, so we
disable this test on FreeBSD as well.
---
 llvm/utils/lit/tests/shtest-ulimit-nondarwin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
index d5340a7d2efb9..be2627be366ed 100644
--- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
+++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
@@ -2,7 +2,7 @@
 
 # ulimit does not work on non-POSIX platforms.
 # These tests are specific to options that Darwin does not support.
-# UNSUPPORTED: system-windows, system-cygwin, system-darwin, system-aix, system-solaris
+# UNSUPPORTED: system-windows, system-cygwin, system-darwin, system-aix, system-solaris, system-freebsd
 
 # RUN: not %{lit} -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s
 

>From 5aee01a3df011e660f26660bc30a8c94a1651d8e Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 27 Dec 2025 21:35:51 +0900
Subject: [PATCH 17/34] [compiler-rt] Rework 08debd7f4461 [-Wno-c2y-extensions]

---
 compiler-rt/lib/builtins/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 107e48b84b2b7..631e1b712ec8c 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -906,6 +906,9 @@ else ()
   append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ BUILTIN_CFLAGS)
   append_list_if(COMPILER_RT_HAS_WBUILTIN_DECLARATION_MISMATCH_FLAG -Werror=builtin-declaration-mismatch BUILTIN_CFLAGS)
 
+  # Builtins use __COUNTER__
+  append_list_if(COMPILER_RT_HAS_WNO_C2Y_EXTENSIONS -Wno-c2y-extensions BUILTIN_CFLAGS)
+
   # Don't embed directives for picking any specific CRT
   if (MSVC)
     set(CMAKE_MSVC_RUNTIME_LIBRARY "")
@@ -1094,7 +1097,6 @@ if (COMPILER_RT_BUILD_CRT)
   append_list_if(COMPILER_RT_HAS_INITFINI_ARRAY -DCRT_HAS_INITFINI_ARRAY CRT_CFLAGS)
   append_list_if(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY -DEH_USE_FRAME_REGISTRY CRT_CFLAGS)
   append_list_if(COMPILER_RT_HAS_FPIC_FLAG -fPIC CRT_CFLAGS)
-  append_list_if(COMPILER_RT_HAS_WNO_C2Y_EXTENSIONS -Wno-c2y-extensions CRT_CFLAGS)
   append_list_if(COMPILER_RT_HAS_WNO_PEDANTIC -Wno-pedantic CRT_CFLAGS)
   if (COMPILER_RT_HAS_FCF_PROTECTION_FLAG)
     append_list_if(COMPILER_RT_ENABLE_CET -fcf-protection=full CRT_CFLAGS)

>From 87e8e7d8f0db53060ef2f6ef4ab612fc0f2b4490 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron at google.com>
Date: Sat, 27 Dec 2025 09:59:53 -0500
Subject: [PATCH 18/34] [SPIRV] Implement lowering for llvm.matrix.transpose
 and llvm.matrix.multiply (#172050)

This patch implements the lowering for the llvm.matrix.transpose and
llvm.matrix.multiply intrinsics in the SPIR-V backend.

- llvm.matrix.transpose is lowered to a G_SHUFFLE_VECTOR with a
  mask calculated to transpose the elements.
- llvm.matrix.multiply is lowered by decomposing the operation into
  dot products of rows and columns:
  - Rows and columns are extracted using G_UNMERGE_VALUES or shuffles.
  - Dot products are computed using OpDot for floating point vectors
    or standard arithmetic for scalars/integers.
  - The result is reconstructed using G_BUILD_VECTOR.

This change also updates SPIRVPostLegalizer to improve type deduction
for G_UNMERGE_VALUES, enabling correct type assignment for the
intermediate virtual registers generated during lowering.

New tests are added to verify support for various matrix sizes and
element types (float and int).
---
 llvm/lib/Target/SPIRV/SPIRVCombine.td         |  18 +-
 llvm/lib/Target/SPIRV/SPIRVCombinerHelper.cpp | 189 ++++++++++++++++++
 llvm/lib/Target/SPIRV/SPIRVCombinerHelper.h   |  21 ++
 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp  |   7 +-
 llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp  |  95 +++++----
 .../SPIRV/llvm-intrinsics/matrix-multiply.ll  | 168 ++++++++++++++++
 .../SPIRV/llvm-intrinsics/matrix-transpose.ll | 124 ++++++++++++
 7 files changed, 567 insertions(+), 55 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/matrix-multiply.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/matrix-transpose.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVCombine.td b/llvm/lib/Target/SPIRV/SPIRVCombine.td
index 991a5de1c4e83..7d69465de4ffb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCombine.td
+++ b/llvm/lib/Target/SPIRV/SPIRVCombine.td
@@ -22,8 +22,22 @@ def vector_select_to_faceforward_lowering : GICombineRule <
   (apply [{ Helper.applySPIRVFaceForward(*${root}); }])
 >;
 
+def matrix_transpose_lowering
+    : GICombineRule<(defs root:$root),
+                    (match (wip_match_opcode G_INTRINSIC):$root,
+                        [{ return Helper.matchMatrixTranspose(*${root}); }]),
+                    (apply [{ Helper.applyMatrixTranspose(*${root}); }])>;
+
+def matrix_multiply_lowering
+    : GICombineRule<(defs root:$root),
+                    (match (wip_match_opcode G_INTRINSIC):$root,
+                        [{ return Helper.matchMatrixMultiply(*${root}); }]),
+                    (apply [{ Helper.applyMatrixMultiply(*${root}); }])>;
+
 def SPIRVPreLegalizerCombiner
     : GICombiner<"SPIRVPreLegalizerCombinerImpl",
-                       [vector_length_sub_to_distance_lowering, vector_select_to_faceforward_lowering]> {
-    let CombineAllMethodName = "tryCombineAllImpl";
+                 [vector_length_sub_to_distance_lowering,
+                  vector_select_to_faceforward_lowering,
+                  matrix_transpose_lowering, matrix_multiply_lowering]> {
+  let CombineAllMethodName = "tryCombineAllImpl";
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.cpp b/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.cpp
index fad2b676fee04..693b74c1e06d7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.cpp
@@ -7,9 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "SPIRVCombinerHelper.h"
+#include "SPIRVGlobalRegistry.h"
+#include "SPIRVUtils.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IntrinsicsSPIRV.h"
+#include "llvm/IR/LLVMContext.h" // Explicitly include for LLVMContext
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
@@ -209,3 +213,188 @@ void SPIRVCombinerHelper::applySPIRVFaceForward(MachineInstr &MI) const {
   GR->invalidateMachineInstr(FalseInstr);
   FalseInstr->eraseFromParent();
 }
+
+bool SPIRVCombinerHelper::matchMatrixTranspose(MachineInstr &MI) const {
+  return MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
+         cast<GIntrinsic>(MI).getIntrinsicID() == Intrinsic::matrix_transpose;
+}
+
+void SPIRVCombinerHelper::applyMatrixTranspose(MachineInstr &MI) const {
+  Register ResReg = MI.getOperand(0).getReg();
+  Register InReg = MI.getOperand(2).getReg();
+  uint32_t Rows = MI.getOperand(3).getImm();
+  uint32_t Cols = MI.getOperand(4).getImm();
+
+  Builder.setInstrAndDebugLoc(MI);
+
+  if (Rows == 1 && Cols == 1) {
+    Builder.buildCopy(ResReg, InReg);
+    MI.eraseFromParent();
+    return;
+  }
+
+  SmallVector<int, 16> Mask;
+  for (uint32_t K = 0; K < Rows * Cols; ++K) {
+    uint32_t R = K / Cols;
+    uint32_t C = K % Cols;
+    Mask.push_back(C * Rows + R);
+  }
+
+  Builder.buildShuffleVector(ResReg, InReg, InReg, Mask);
+  MI.eraseFromParent();
+}
+
+bool SPIRVCombinerHelper::matchMatrixMultiply(MachineInstr &MI) const {
+  return MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
+         cast<GIntrinsic>(MI).getIntrinsicID() == Intrinsic::matrix_multiply;
+}
+
+SmallVector<Register, 4>
+SPIRVCombinerHelper::extractColumns(Register MatrixReg, uint32_t NumberOfCols,
+                                    SPIRVType *SpvColType,
+                                    SPIRVGlobalRegistry *GR) const {
+  // If the matrix is a single column, return that single column.
+  if (NumberOfCols == 1)
+    return {MatrixReg};
+
+  SmallVector<Register, 4> Cols;
+  LLT ColTy = GR->getRegType(SpvColType);
+  for (uint32_t J = 0; J < NumberOfCols; ++J)
+    Cols.push_back(MRI.createGenericVirtualRegister(ColTy));
+  Builder.buildUnmerge(Cols, MatrixReg);
+  for (Register R : Cols) {
+    setRegClassType(R, SpvColType, GR, &MRI, Builder.getMF());
+  }
+  return Cols;
+}
+
+SmallVector<Register, 4>
+SPIRVCombinerHelper::extractRows(Register MatrixReg, uint32_t NumRows,
+                                 uint32_t NumCols, SPIRVType *SpvRowType,
+                                 SPIRVGlobalRegistry *GR) const {
+  SmallVector<Register, 4> Rows;
+  LLT VecTy = GR->getRegType(SpvRowType);
+
+  // If there is only one column, then each row is a scalar that needs
+  // to be extracted.
+  if (NumCols == 1) {
+    assert(SpvRowType->getOpcode() != SPIRV::OpTypeVector);
+    for (uint32_t I = 0; I < NumRows; ++I)
+      Rows.push_back(MRI.createGenericVirtualRegister(VecTy));
+    Builder.buildUnmerge(Rows, MatrixReg);
+    for (Register R : Rows) {
+      setRegClassType(R, SpvRowType, GR, &MRI, Builder.getMF());
+    }
+    return Rows;
+  }
+
+  // If the matrix is a single row return that row.
+  if (NumRows == 1) {
+    return {MatrixReg};
+  }
+
+  for (uint32_t I = 0; I < NumRows; ++I) {
+    SmallVector<int, 4> Mask;
+    for (uint32_t k = 0; k < NumCols; ++k)
+      Mask.push_back(k * NumRows + I);
+    Rows.push_back(Builder.buildShuffleVector(VecTy, MatrixReg, MatrixReg, Mask)
+                       .getReg(0));
+  }
+  for (Register R : Rows) {
+    setRegClassType(R, SpvRowType, GR, &MRI, Builder.getMF());
+  }
+  return Rows;
+}
+
+Register SPIRVCombinerHelper::computeDotProduct(Register RowA, Register ColB,
+                                                SPIRVType *SpvVecType,
+                                                SPIRVGlobalRegistry *GR) const {
+  bool IsVectorOp = SpvVecType->getOpcode() == SPIRV::OpTypeVector;
+  SPIRVType *SpvScalarType = GR->getScalarOrVectorComponentType(SpvVecType);
+  bool IsFloatOp = SpvScalarType->getOpcode() == SPIRV::OpTypeFloat;
+  LLT VecTy = GR->getRegType(SpvVecType);
+
+  Register DotRes;
+  if (IsVectorOp) {
+    LLT ScalarTy = VecTy.getElementType();
+    Intrinsic::SPVIntrinsics DotIntrinsic =
+        (IsFloatOp ? Intrinsic::spv_fdot : Intrinsic::spv_udot);
+    DotRes = Builder.buildIntrinsic(DotIntrinsic, {ScalarTy})
+                 .addUse(RowA)
+                 .addUse(ColB)
+                 .getReg(0);
+  } else {
+    if (IsFloatOp)
+      DotRes = Builder.buildFMul(VecTy, RowA, ColB).getReg(0);
+    else
+      DotRes = Builder.buildMul(VecTy, RowA, ColB).getReg(0);
+  }
+  setRegClassType(DotRes, SpvScalarType, GR, &MRI, Builder.getMF());
+  return DotRes;
+}
+
+SmallVector<Register, 16>
+SPIRVCombinerHelper::computeDotProducts(const SmallVector<Register, 4> &RowsA,
+                                        const SmallVector<Register, 4> &ColsB,
+                                        SPIRVType *SpvVecType,
+                                        SPIRVGlobalRegistry *GR) const {
+  SmallVector<Register, 16> ResultScalars;
+  for (uint32_t J = 0; J < ColsB.size(); ++J) {
+    for (uint32_t I = 0; I < RowsA.size(); ++I) {
+      ResultScalars.push_back(
+          computeDotProduct(RowsA[I], ColsB[J], SpvVecType, GR));
+    }
+  }
+  return ResultScalars;
+}
+
+SPIRVType *
+SPIRVCombinerHelper::getDotProductVectorType(Register ResReg, uint32_t K,
+                                             SPIRVGlobalRegistry *GR) const {
+  // Loop over all non-debug uses of ResReg.
+  Type *ScalarResType = nullptr;
+  for (auto &UseMI : MRI.use_instructions(ResReg)) {
+    if (UseMI.getOpcode() != TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
+      continue;
+
+    if (!isSpvIntrinsic(UseMI, Intrinsic::spv_assign_type))
+      continue;
+
+    Type *Ty = getMDOperandAsType(UseMI.getOperand(2).getMetadata(), 0);
+    if (Ty->isVectorTy())
+      ScalarResType = cast<VectorType>(Ty)->getElementType();
+    else
+      ScalarResType = Ty;
+    assert(ScalarResType->isIntegerTy() || ScalarResType->isFloatingPointTy());
+    break;
+  }
+  Type *VecType =
+      (K > 1 ? FixedVectorType::get(ScalarResType, K) : ScalarResType);
+  return GR->getOrCreateSPIRVType(VecType, Builder,
+                                  SPIRV::AccessQualifier::None, false);
+}
+
+void SPIRVCombinerHelper::applyMatrixMultiply(MachineInstr &MI) const {
+  Register ResReg = MI.getOperand(0).getReg();
+  Register AReg = MI.getOperand(2).getReg();
+  Register BReg = MI.getOperand(3).getReg();
+  uint32_t NumRowsA = MI.getOperand(4).getImm();
+  uint32_t NumColsA = MI.getOperand(5).getImm();
+  uint32_t NumColsB = MI.getOperand(6).getImm();
+
+  Builder.setInstrAndDebugLoc(MI);
+
+  SPIRVGlobalRegistry *GR =
+      MI.getMF()->getSubtarget<SPIRVSubtarget>().getSPIRVGlobalRegistry();
+
+  SPIRVType *SpvVecType = getDotProductVectorType(ResReg, NumColsA, GR);
+  SmallVector<Register, 4> ColsB =
+      extractColumns(BReg, NumColsB, SpvVecType, GR);
+  SmallVector<Register, 4> RowsA =
+      extractRows(AReg, NumRowsA, NumColsA, SpvVecType, GR);
+  SmallVector<Register, 16> ResultScalars =
+      computeDotProducts(RowsA, ColsB, SpvVecType, GR);
+
+  Builder.buildBuildVector(ResReg, ResultScalars);
+  MI.eraseFromParent();
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.h b/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.h
index 3118cdc744b8f..b6b3b36f03ade 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.h
+++ b/llvm/lib/Target/SPIRV/SPIRVCombinerHelper.h
@@ -33,6 +33,27 @@ class SPIRVCombinerHelper : public CombinerHelper {
   void applySPIRVDistance(MachineInstr &MI) const;
   bool matchSelectToFaceForward(MachineInstr &MI) const;
   void applySPIRVFaceForward(MachineInstr &MI) const;
+  bool matchMatrixTranspose(MachineInstr &MI) const;
+  void applyMatrixTranspose(MachineInstr &MI) const;
+  bool matchMatrixMultiply(MachineInstr &MI) const;
+  void applyMatrixMultiply(MachineInstr &MI) const;
+
+private:
+  SPIRVType *getDotProductVectorType(Register ResReg, uint32_t K,
+                                     SPIRVGlobalRegistry *GR) const;
+  SmallVector<Register, 4> extractColumns(Register BReg, uint32_t N,
+                                          SPIRVType *SpvVecType,
+                                          SPIRVGlobalRegistry *GR) const;
+  SmallVector<Register, 4> extractRows(Register AReg, uint32_t NumRows,
+                                       uint32_t NumCols, SPIRVType *SpvRowType,
+                                       SPIRVGlobalRegistry *GR) const;
+  SmallVector<Register, 16>
+  computeDotProducts(const SmallVector<Register, 4> &RowsA,
+                     const SmallVector<Register, 4> &ColsB,
+                     SPIRVType *SpvVecType, SPIRVGlobalRegistry *GR) const;
+  Register computeDotProduct(Register RowA, Register ColB,
+                             SPIRVType *SpvVecType,
+                             SPIRVGlobalRegistry *GR) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index d37adecbc1956..590182731b002 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -175,6 +175,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
   // non-shader contexts, vector sizes of 8 and 16 are also permitted, but
   // arbitrary sizes (e.g., 6 or 11) are not.
   uint32_t MaxVectorSize = ST.isShader() ? 4 : 16;
+  LLVM_DEBUG(dbgs() << "MaxVectorSize: " << MaxVectorSize << "\n");
 
   for (auto Opc : getTypeFoldingSupportedOpcodes()) {
     switch (Opc) {
@@ -223,8 +224,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
       .moreElementsToNextPow2(0)
       .lowerIf(vectorElementCountIsGreaterThan(0, MaxVectorSize))
       .moreElementsToNextPow2(1)
-      .lowerIf(vectorElementCountIsGreaterThan(1, MaxVectorSize))
-      .alwaysLegal();
+      .lowerIf(vectorElementCountIsGreaterThan(1, MaxVectorSize));
 
   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
       .moreElementsToNextPow2(1)
@@ -265,8 +265,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
 
   // If the result is still illegal, the combiner should be able to remove it.
   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
-      .legalForCartesianProduct(allowedVectorTypes, allowedVectorTypes)
-      .moreElementsToNextPow2(0);
+      .legalForCartesianProduct(allowedVectorTypes, allowedVectorTypes);
 
   getActionDefinitionsBuilder(G_SPLAT_VECTOR)
       .legalFor(allowedVectorTypes)
diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
index 99edb937c3daa..5f52f60da37e1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
@@ -51,54 +51,6 @@ static SPIRVType *deduceIntTypeFromResult(Register ResVReg,
   return GR->getOrCreateSPIRVIntegerType(Ty.getScalarSizeInBits(), MIB);
 }
 
-static bool deduceAndAssignTypeForGUnmerge(MachineInstr *I, MachineFunction &MF,
-                                           SPIRVGlobalRegistry *GR) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  Register SrcReg = I->getOperand(I->getNumOperands() - 1).getReg();
-  SPIRVType *ScalarType = nullptr;
-  if (SPIRVType *DefType = GR->getSPIRVTypeForVReg(SrcReg)) {
-    assert(DefType->getOpcode() == SPIRV::OpTypeVector);
-    ScalarType = GR->getSPIRVTypeForVReg(DefType->getOperand(1).getReg());
-  }
-
-  if (!ScalarType) {
-    // If we could not deduce the type from the source, try to deduce it from
-    // the uses of the results.
-    for (unsigned i = 0; i < I->getNumDefs() && !ScalarType; ++i) {
-      for (const auto &Use :
-           MRI.use_nodbg_instructions(I->getOperand(i).getReg())) {
-        if (Use.getOpcode() != TargetOpcode::G_BUILD_VECTOR)
-          continue;
-
-        if (auto *VecType =
-                GR->getSPIRVTypeForVReg(Use.getOperand(0).getReg())) {
-          ScalarType = GR->getScalarOrVectorComponentType(VecType);
-          break;
-        }
-      }
-    }
-  }
-
-  if (!ScalarType)
-    return false;
-
-  for (unsigned i = 0; i < I->getNumDefs(); ++i) {
-    Register DefReg = I->getOperand(i).getReg();
-    if (GR->getSPIRVTypeForVReg(DefReg))
-      continue;
-
-    LLT DefLLT = MRI.getType(DefReg);
-    SPIRVType *ResType =
-        DefLLT.isVector()
-            ? GR->getOrCreateSPIRVVectorType(
-                  ScalarType, DefLLT.getNumElements(), *I,
-                  *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo())
-            : ScalarType;
-    setRegClassType(DefReg, ResType, GR, &MRI, MF);
-  }
-  return true;
-}
-
 static SPIRVType *deduceTypeFromSingleOperand(MachineInstr *I,
                                               MachineIRBuilder &MIB,
                                               SPIRVGlobalRegistry *GR,
@@ -179,6 +131,7 @@ static SPIRVType *deduceTypeFromUses(Register Reg, MachineFunction &MF,
     case TargetOpcode::G_FDIV:
     case TargetOpcode::G_FREM:
     case TargetOpcode::G_FMA:
+    case TargetOpcode::COPY:
     case TargetOpcode::G_STRICT_FMA:
       ResType = deduceTypeFromResultRegister(&Use, Reg, GR, MIB);
       break;
@@ -223,6 +176,50 @@ static SPIRVType *deduceResultTypeFromOperands(MachineInstr *I,
   }
 }
 
+static bool deduceAndAssignTypeForGUnmerge(MachineInstr *I, MachineFunction &MF,
+                                           SPIRVGlobalRegistry *GR,
+                                           MachineIRBuilder &MIB) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  Register SrcReg = I->getOperand(I->getNumOperands() - 1).getReg();
+  SPIRVType *ScalarType = nullptr;
+  if (SPIRVType *DefType = GR->getSPIRVTypeForVReg(SrcReg)) {
+    assert(DefType->getOpcode() == SPIRV::OpTypeVector);
+    ScalarType = GR->getSPIRVTypeForVReg(DefType->getOperand(1).getReg());
+  }
+
+  if (!ScalarType) {
+    // If we could not deduce the type from the source, try to deduce it from
+    // the uses of the results.
+    for (unsigned i = 0; i < I->getNumDefs(); ++i) {
+      Register DefReg = I->getOperand(i).getReg();
+      ScalarType = deduceTypeFromUses(DefReg, MF, GR, MIB);
+      if (ScalarType) {
+        ScalarType = GR->getScalarOrVectorComponentType(ScalarType);
+        break;
+      }
+    }
+  }
+
+  if (!ScalarType)
+    return false;
+
+  for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+    Register DefReg = I->getOperand(i).getReg();
+    if (GR->getSPIRVTypeForVReg(DefReg))
+      continue;
+
+    LLT DefLLT = MRI.getType(DefReg);
+    SPIRVType *ResType =
+        DefLLT.isVector()
+            ? GR->getOrCreateSPIRVVectorType(
+                  ScalarType, DefLLT.getNumElements(), *I,
+                  *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo())
+            : ScalarType;
+    setRegClassType(DefReg, ResType, GR, &MRI, MF);
+  }
+  return true;
+}
+
 static bool deduceAndAssignSpirvType(MachineInstr *I, MachineFunction &MF,
                                      SPIRVGlobalRegistry *GR,
                                      MachineIRBuilder &MIB) {
@@ -234,7 +231,7 @@ static bool deduceAndAssignSpirvType(MachineInstr *I, MachineFunction &MF,
   // unlike the other instructions which have a single result register. The main
   // deduction logic is designed for the single-definition case.
   if (I->getOpcode() == TargetOpcode::G_UNMERGE_VALUES)
-    return deduceAndAssignTypeForGUnmerge(I, MF, GR);
+    return deduceAndAssignTypeForGUnmerge(I, MF, GR, MIB);
 
   LLVM_DEBUG(dbgs() << "Inferring type from operands\n");
   SPIRVType *ResType = deduceResultTypeFromOperands(I, GR, MIB);
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/matrix-multiply.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/matrix-multiply.ll
new file mode 100644
index 0000000000000..4f8dfd0494009
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/matrix-multiply.ll
@@ -0,0 +1,168 @@
+; RUN: llc -O0 -mtriple=spirv1.5-unknown-vulkan1.2 %s -o - | FileCheck %s --check-prefixes=CHECK,VK1_1
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-unknown-vulkan1.2 %s -o - -filetype=obj | spirv-val --target-env vulkan1.2 %}
+
+; RUN: llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3 %s -o - | FileCheck %s --check-prefixes=CHECK,VK1_3
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3 %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
+
+ at private_v4f32 = internal addrspace(10) global [4 x float] poison
+ at private_v4i32 = internal addrspace(10) global [4 x i32] poison
+ at private_v6f32 = internal addrspace(10) global [6 x float] poison
+ at private_v2f32 = internal addrspace(10) global [2 x float] poison
+ at private_v1f32 = internal addrspace(10) global [1 x float] poison
+
+
+; CHECK-DAG: %[[Float_ID:[0-9]+]] = OpTypeFloat 32
+; CHECK-DAG: %[[V2F32_ID:[0-9]+]] = OpTypeVector %[[Float_ID]] 2
+; CHECK-DAG: %[[V3F32_ID:[0-9]+]] = OpTypeVector %[[Float_ID]] 3
+; CHECK-DAG: %[[V4F32_ID:[0-9]+]] = OpTypeVector %[[Float_ID]] 4
+; CHECK-DAG: %[[Int_ID:[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG: %[[V2I32_ID:[0-9]+]] = OpTypeVector %[[Int_ID]] 2
+; CHECK-DAG: %[[V4I32_ID:[0-9]+]] = OpTypeVector %[[Int_ID]] 4
+
+; Test Matrix Multiply 2x2 * 2x2 float
+; CHECK-LABEL: ; -- Begin function test_matrix_multiply_f32_2x2_2x2
+; CHECK:       %[[A:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] {{.*}} {{.*}} 3
+; CHECK:       %[[B:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] {{.*}} {{.*}} 3
+; CHECK-DAG:   %[[B_Col0:[0-9]+]] = OpVectorShuffle %[[V2F32_ID]] %[[B]] %[[#]] 0 1
+; CHECK-DAG:   %[[B_Col1:[0-9]+]] = OpVectorShuffle %[[V2F32_ID]] %[[B]] %[[#]] 2 3
+; CHECK-DAG:   %[[A_Row0:[0-9]+]] = OpVectorShuffle %[[V2F32_ID]] %[[A]] %[[A]] 0 2
+; CHECK-DAG:   %[[A_Row1:[0-9]+]] = OpVectorShuffle %[[V2F32_ID]] %[[A]] %[[A]] 1 3
+; CHECK-DAG:   %[[C00:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row0]] %[[B_Col0]]
+; CHECK-DAG:   %[[C10:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row1]] %[[B_Col0]]
+; CHECK-DAG:   %[[C01:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row0]] %[[B_Col1]]
+; CHECK-DAG:   %[[C11:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row1]] %[[B_Col1]]
+; CHECK:       OpCompositeConstruct %[[V4F32_ID]] %[[C00]] %[[C10]] %[[C01]] %[[C11]]
+define internal void @test_matrix_multiply_f32_2x2_2x2() {
+  %1 = load <4 x float>, ptr addrspace(10) @private_v4f32
+  %2 = load <4 x float>, ptr addrspace(10) @private_v4f32
+  %3 = call <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float> %1, <4 x float> %2, i32 2, i32 2, i32 2)
+  store <4 x float> %3, ptr addrspace(10) @private_v4f32
+  ret void
+}
+
+; Test Matrix Multiply 2x2 * 2x2 int
+; CHECK-LABEL: ; -- Begin function test_matrix_multiply_i32_2x2_2x2
+; CHECK:       %[[A:[0-9]+]] = OpCompositeInsert %[[V4I32_ID]] {{.*}} {{.*}} 3
+; CHECK:       %[[B:[0-9]+]] = OpCompositeInsert %[[V4I32_ID]] {{.*}} {{.*}} 3
+; CHECK-DAG:   %[[B_Col0:[0-9]+]] = OpVectorShuffle %[[V2I32_ID]] %[[B]] %[[#]] 0 1
+; CHECK-DAG:   %[[B_Col1:[0-9]+]] = OpVectorShuffle %[[V2I32_ID]] %[[B]] %[[#]] 2 3
+; CHECK-DAG:   %[[A_Row0:[0-9]+]] = OpVectorShuffle %[[V2I32_ID]] %[[A]] %[[A]] 0 2
+; CHECK-DAG:   %[[A_Row1:[0-9]+]] = OpVectorShuffle %[[V2I32_ID]] %[[A]] %[[A]] 1 3
+;
+; -- C00 = dot(A_Row0, B_Col0)
+; VK1_1-DAG:   %[[Mul00:[0-9]+]] = OpIMul %[[V2I32_ID]] %[[A_Row0]] %[[B_Col0]]
+; VK1_1-DAG:   %[[E00_0:[0-9]+]] = OpCompositeExtract %[[Int_ID]] %[[Mul00]] 0
+; VK1_1-DAG:   %[[E00_1:[0-9]+]] = OpCompositeExtract %[[Int_ID]] %[[Mul00]] 1
+; VK1_1-DAG:   %[[C00:[0-9]+]] = OpIAdd %[[Int_ID]] %[[E00_0]] %[[E00_1]]
+; VK1_3-DAG:   %[[C00:[0-9]+]] = OpUDot %[[Int_ID]] %[[A_Row0]] %[[B_Col0]]
+;
+; -- C10 = dot(A_Row1, B_Col0)
+; VK1_1-DAG:   %[[Mul10:[0-9]+]] = OpIMul %[[V2I32_ID]] %[[A_Row1]] %[[B_Col0]]
+; VK1_1-DAG:   %[[E10_0:[0-9]+]] = OpCompositeExtract %[[Int_ID]] %[[Mul10]] 0
+; VK1_1-DAG:   %[[E10_1:[0-9]+]] = OpCompositeExtract %[[Int_ID]] %[[Mul10]] 1
+; VK1_1-DAG:   %[[C10:[0-9]+]] = OpIAdd %[[Int_ID]] %[[E10_0]] %[[E10_1]]
+; VK1_3-DAG:   %[[C10:[0-9]+]] = OpUDot %[[Int_ID]] %[[A_Row1]] %[[B_Col0]]
+;
+; -- C11 = dot(A_Row1, B_Col1)
+; VK1_1-DAG:   %[[Mul11:[0-9]+]] = OpIMul %[[V2I32_ID]] %[[A_Row1]] %[[B_Col1]]
+; VK1_1-DAG:   %[[E11_0:[0-9]+]] = OpCompositeExtract %[[Int_ID]] %[[Mul11]] 0
+; VK1_1-DAG:   %[[E11_1:[0-9]+]] = OpCompositeExtract %[[Int_ID]] %[[Mul11]] 1
+; VK1_1-DAG:   %[[C11:[0-9]+]] = OpIAdd %[[Int_ID]] %[[E11_0]] %[[E11_1]]
+; VK1_3-DAG:   %[[C11:[0-9]+]] = OpUDot %[[Int_ID]] %[[A_Row1]] %[[B_Col1]]
+;
+; -- C01 = dot(A_Row0, B_Col1)
+; VK1_1-DAG:   %[[Mul01:[0-9]+]] = OpIMul %[[V2I32_ID]] %[[A_Row0]] %[[B_Col1]]
+; VK1_1-DAG:   %[[E01_0:[0-9]+]] = OpCompositeExtract %[[Int_ID]] %[[Mul01]] 0
+; VK1_1-DAG:   %[[E01_1:[0-9]+]] = OpCompositeExtract %[[Int_ID]] %[[Mul01]] 1
+; VK1_1-DAG:   %[[C01:[0-9]+]] = OpIAdd %[[Int_ID]] %[[E01_0]] %[[E01_1]]
+; VK1_3-DAG:   %[[C01:[0-9]+]] = OpUDot %[[Int_ID]] %[[A_Row0]] %[[B_Col1]]
+;
+; CHECK:       OpCompositeConstruct %[[V4I32_ID]] %[[C00]] %[[C10]] %[[C01]] %[[C11]]
+define internal void @test_matrix_multiply_i32_2x2_2x2() {
+  %1 = load <4 x i32>, ptr addrspace(10) @private_v4i32
+  %2 = load <4 x i32>, ptr addrspace(10) @private_v4i32
+  %3 = call <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32> %1, <4 x i32> %2, i32 2, i32 2, i32 2)
+  store <4 x i32> %3, ptr addrspace(10) @private_v4i32
+  ret void
+}
+
+; Test Matrix Multiply 2x3 * 3x2 float (Result 2x2 float)
+; CHECK-LABEL: ; -- Begin function test_matrix_multiply_f32_2x3_3x2
+; CHECK-DAG:   %[[B:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]]
+; CHECK-DAG:   %[[A:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]]
+;
+; CHECK-DAG:   %[[B_Col0:[0-9]+]] = OpCompositeConstruct %[[V3F32_ID]]
+; CHECK-DAG:   %[[B_Col1:[0-9]+]] = OpCompositeConstruct %[[V3F32_ID]]
+; CHECK-DAG:   %[[A_Row0:[0-9]+]] = OpCompositeConstruct %[[V3F32_ID]]
+; CHECK-DAG:   %[[A_Row1:[0-9]+]] = OpCompositeConstruct %[[V3F32_ID]]
+;
+; CHECK-DAG:   %[[C00:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row0]] %[[B_Col0]]
+; CHECK-DAG:   %[[C10:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row1]] %[[B_Col0]]
+; CHECK-DAG:   %[[C01:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row0]] %[[B_Col1]]
+; CHECK-DAG:   %[[C11:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row1]] %[[B_Col1]]
+; CHECK:       OpCompositeConstruct %[[V4F32_ID]] %[[C00]] %[[C10]] %[[C01]] %[[C11]]
+define internal void @test_matrix_multiply_f32_2x3_3x2() {
+  %1 = load <6 x float>, ptr addrspace(10) @private_v6f32
+  %2 = load <6 x float>, ptr addrspace(10) @private_v6f32
+  %3 = call <4 x float> @llvm.matrix.multiply.v4f32.v6f32.v6f32(<6 x float> %1, <6 x float> %2, i32 2, i32 3, i32 2)
+  store <4 x float> %3, ptr addrspace(10) @private_v4f32
+  ret void
+}
+
+; Test Matrix Multiply 2x2 * 2x1 float (Result 2x1 vector)
+; CHECK-LABEL: ; -- Begin function test_matrix_multiply_f32_2x2_2x1_vec
+; CHECK:       %[[A:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] {{.*}} {{.*}} 3
+; CHECK:       %[[B:[0-9]+]] = OpCompositeInsert %[[V2F32_ID]] {{.*}} {{.*}} 1
+; CHECK-DAG:   %[[A_Row0:[0-9]+]] = OpVectorShuffle %[[V2F32_ID]] %[[A]] %[[A]] 0 2
+; CHECK-DAG:   %[[A_Row1:[0-9]+]] = OpVectorShuffle %[[V2F32_ID]] %[[A]] %[[A]] 1 3
+; CHECK-DAG:   %[[C00:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row0]] %[[B]]
+; CHECK-DAG:   %[[C10:[0-9]+]] = OpDot %[[Float_ID]] %[[A_Row1]] %[[B]]
+; CHECK:       OpCompositeConstruct %[[V2F32_ID]] %[[C00]] %[[C10]]
+define internal void @test_matrix_multiply_f32_2x2_2x1_vec() {
+  %1 = load <4 x float>, ptr addrspace(10) @private_v4f32
+  %2 = load <2 x float>, ptr addrspace(10) @private_v2f32
+  %3 = call <2 x float> @llvm.matrix.multiply.v2f32.v4f32.v2f32(<4 x float> %1, <2 x float> %2, i32 2, i32 2, i32 1)
+  store <2 x float> %3, ptr addrspace(10) @private_v2f32
+  ret void
+}
+
+; Test Matrix Multiply 1x2 * 2x2 float (Result 1x2 vector)
+; CHECK-LABEL: ; -- Begin function test_matrix_multiply_f32_1x2_2x2_vec
+; CHECK:       %[[A:[0-9]+]] = OpCompositeInsert %[[V2F32_ID]] {{.*}} {{.*}} 1
+; CHECK:       %[[B:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] {{.*}} {{.*}} 3
+; CHECK-DAG:   %[[B_Col0:[0-9]+]] = OpVectorShuffle %[[V2F32_ID]] %[[B]] %[[#]] 0 1
+; CHECK-DAG:   %[[B_Col1:[0-9]+]] = OpVectorShuffle %[[V2F32_ID]] %[[B]] %[[#]] 2 3
+; CHECK-DAG:   %[[C00:[0-9]+]] = OpDot %[[Float_ID]] %[[A]] %[[B_Col0]]
+; CHECK-DAG:   %[[C01:[0-9]+]] = OpDot %[[Float_ID]] %[[A]] %[[B_Col1]]
+; CHECK:       OpCompositeConstruct %[[V2F32_ID]] %[[C00]] %[[C01]]
+define internal void @test_matrix_multiply_f32_1x2_2x2_vec() {
+  %1 = load <2 x float>, ptr addrspace(10) @private_v2f32
+  %2 = load <4 x float>, ptr addrspace(10) @private_v4f32
+  %3 = call <2 x float> @llvm.matrix.multiply.v2f32.v2f32.v4f32(<2 x float> %1, <4 x float> %2, i32 1, i32 2, i32 2)
+  store <2 x float> %3, ptr addrspace(10) @private_v2f32
+  ret void
+}
+
+; Test Matrix Multiply 1x2 * 2x1 float (Result 1x1 scalar - OpDot)
+; TODO(171175): The SPIR-V backend does not legalize single element vectors.
+; CHECK-DISABLE: ; -- Begin function test_matrix_multiply_f32_1x2_2x1_scalar
+; define internal void @test_matrix_multiply_f32_1x2_2x1_scalar() {
+;   %1 = load <2 x float>, ptr addrspace(10) @private_v2f32
+;   %2 = load <2 x float>, ptr addrspace(10) @private_v2f32
+;   %3 = call <1 x float> @llvm.matrix.multiply.v1f32.v2f32.v2f32(<2 x float> %1, <2 x float> %2, i32 1, i32 2, i32 1)
+;   store <1 x float> %3, ptr addrspace(10) @private_v1f32
+;   ret void
+; }
+
+define void @main() #0 {
+  ret void
+}
+
+declare <4 x float> @llvm.matrix.multiply.v4f32.v4f32.v4f32(<4 x float>, <4 x float>, i32, i32, i32)
+declare <4 x i32> @llvm.matrix.multiply.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32)
+declare <4 x float> @llvm.matrix.multiply.v4f32.v6f32.v6f32(<6 x float>, <6 x float>, i32, i32, i32)
+declare <2 x float> @llvm.matrix.multiply.v2f32.v4f32.v2f32(<4 x float>, <2 x float>, i32, i32, i32)
+declare <2 x float> @llvm.matrix.multiply.v2f32.v2f32.v4f32(<2 x float>, <4 x float>, i32, i32, i32)
+; declare <1 x float> @llvm.matrix.multiply.v1f32.v2f32.v2f32(<2 x float>, <2 x float>, i32, i32, i32)
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/matrix-transpose.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/matrix-transpose.ll
new file mode 100644
index 0000000000000..3474fecae9957
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/matrix-transpose.ll
@@ -0,0 +1,124 @@
+; RUN: llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3 %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3 %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
+
+ at private_v4f32 = internal addrspace(10) global [4 x float] poison
+ at private_v6f32 = internal addrspace(10) global [6 x float] poison
+ at private_v1f32 = internal addrspace(10) global [1 x float] poison
+
+; CHECK-DAG: %[[Float_ID:[0-9]+]] = OpTypeFloat 32
+; CHECK-DAG: %[[V4F32_ID:[0-9]+]] = OpTypeVector %[[Float_ID]] 4
+
+; Test Transpose 2x2 float
+; CHECK-LABEL: ; -- Begin function test_transpose_f32_2x2
+; CHECK: %[[Shuffle:[0-9]+]] = OpVectorShuffle %[[V4F32_ID]] {{.*}} 0 2 1 3
+define internal void @test_transpose_f32_2x2() {
+ %1 = load <4 x float>, ptr addrspace(10) @private_v4f32
+ %2 = call <4 x float> @llvm.matrix.transpose.v4f32.i32(<4 x float> %1, i32 2, i32 2)
+ store <4 x float> %2, ptr addrspace(10) @private_v4f32
+ ret void
+}
+
+; Test Transpose 2x3 float (Result is 3x2 float)
+; Note: We should add more code to the prelegalizer combiner to be able to remove the insert and extracts. 
+;       This test should reduce to a series of access chains, loads, and stores.
+; CHECK-LABEL: ; -- Begin function test_transpose_f32_2x3
+define internal void @test_transpose_f32_2x3() {
+; -- Load input 2x3 matrix elements
+; CHECK: %[[AccessChain1:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID:[0-9]+]] %[[private_v6f32:[0-9]+]] %[[int_0:[0-9]+]]
+; CHECK: %[[Load1:[0-9]+]] = OpLoad %[[Float_ID]] %[[AccessChain1]]
+; CHECK: %[[AccessChain2:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_1:[0-9]+]]
+; CHECK: %[[Load2:[0-9]+]] = OpLoad %[[Float_ID]] %[[AccessChain2]]
+; CHECK: %[[AccessChain3:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_2:[0-9]+]]
+; CHECK: %[[Load3:[0-9]+]] = OpLoad %[[Float_ID]] %[[AccessChain3]]
+; CHECK: %[[AccessChain4:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_3:[0-9]+]]
+; CHECK: %[[Load4:[0-9]+]] = OpLoad %[[Float_ID]] %[[AccessChain4]]
+; CHECK: %[[AccessChain5:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_4:[0-9]+]]
+; CHECK: %[[Load5:[0-9]+]] = OpLoad %[[Float_ID]] %[[AccessChain5]]
+; CHECK: %[[AccessChain6:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_5:[0-9]+]]
+; CHECK: %[[Load6:[0-9]+]] = OpLoad %[[Float_ID]] %[[AccessChain6]]
+;
+; -- Construct intermediate vectors
+; CHECK: %[[CompositeInsert1:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] %[[Load1]] %[[undef_V4F32_ID:[0-9]+]] 0
+; CHECK: %[[CompositeInsert2:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] %[[Load2]] %[[CompositeInsert1]] 1
+; CHECK: %[[CompositeInsert3:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] %[[Load3]] %[[CompositeInsert2]] 2
+; CHECK: %[[CompositeInsert4:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] %[[Load4]] %[[CompositeInsert3]] 3
+; CHECK: %[[CompositeInsert5:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] %[[Load5]] %[[undef_V4F32_ID]] 0
+; CHECK: %[[CompositeInsert6:[0-9]+]] = OpCompositeInsert %[[V4F32_ID]] %[[Load6]] %[[CompositeInsert5]] 1
+  %1 = load <6 x float>, ptr addrspace(10) @private_v6f32
+
+; -- Extract elements for transposition
+; CHECK: %[[Extract1:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeInsert4]] 0
+; CHECK: %[[Extract2:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeInsert4]] 2
+; CHECK: %[[Extract3:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeInsert6]] 0
+; CHECK: %[[Extract4:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeInsert4]] 1
+; CHECK: %[[Extract5:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeInsert4]] 3
+; CHECK: %[[Extract6:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeInsert6]] 1
+  %2 = call <6 x float> @llvm.matrix.transpose.v6f32.i32(<6 x float> %1, i32 2, i32 3)
+
+; -- Store output 3x2 matrix elements
+; CHECK: %[[AccessChain7:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_0]]
+; CHECK: %[[CompositeConstruct1:[0-9]+]] = OpCompositeConstruct %[[V4F32_ID]] %[[Extract1]] %[[Extract2]] %[[Extract3]] %[[Extract4]]
+; CHECK: %[[Extract7:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeConstruct1]] 0
+; CHECK: OpStore %[[AccessChain7]] %[[Extract7]]
+; CHECK: %[[AccessChain8:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_1]]
+; CHECK: %[[CompositeConstruct2:[0-9]+]] = OpCompositeConstruct %[[V4F32_ID]] %[[Extract1]] %[[Extract2]] %[[Extract3]] %[[Extract4]]
+; CHECK: %[[Extract8:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeConstruct2]] 1
+; CHECK: OpStore %[[AccessChain8]] %[[Extract8]]
+; CHECK: %[[AccessChain9:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_2]]
+; CHECK: %[[CompositeConstruct3:[0-9]+]] = OpCompositeConstruct %[[V4F32_ID]] %[[Extract1]] %[[Extract2]] %[[Extract3]] %[[Extract4]]
+; CHECK: %[[Extract9:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeConstruct3]] 2
+; CHECK: OpStore %[[AccessChain9]] %[[Extract9]]
+; CHECK: %[[AccessChain10:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_3]]
+; CHECK: %[[CompositeConstruct4:[0-9]+]] = OpCompositeConstruct %[[V4F32_ID]] %[[Extract1]] %[[Extract2]] %[[Extract3]] %[[Extract4]]
+; CHECK: %[[Extract10:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeConstruct4]] 3
+; CHECK: OpStore %[[AccessChain10]] %[[Extract10]]
+; CHECK: %[[AccessChain11:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_4]]
+; CHECK: %[[CompositeConstruct5:[0-9]+]] = OpCompositeConstruct %[[V4F32_ID]] %[[Extract5]] %[[Extract6]] %[[undef_Float_ID:[0-9]+]] %[[undef_Float_ID]]
+; CHECK: %[[Extract11:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeConstruct5]] 0
+; CHECK: OpStore %[[AccessChain11]] %[[Extract11]]
+; CHECK: %[[AccessChain12:[0-9]+]] = OpAccessChain %[[_ptr_Float_ID]] %[[private_v6f32]] %[[int_5]]
+; CHECK: %[[CompositeConstruct6:[0-9]+]] = OpCompositeConstruct %[[V4F32_ID]] %[[Extract5]] %[[Extract6]] %[[undef_Float_ID]] %[[undef_Float_ID]]
+; CHECK: %[[Extract12:[0-9]+]] = OpCompositeExtract %[[Float_ID]] %[[CompositeConstruct6]] 1
+; CHECK: OpStore %[[AccessChain12]] %[[Extract12]]
+  store <6 x float> %2, ptr addrspace(10) @private_v6f32
+  ret void
+}
+
+; Test Transpose 1x4 float (Result is 4x1 float), should be a copy (vector of 4 floats)
+; CHECK-LABEL: ; -- Begin function test_transpose_f32_1x4_to_4x1
+; CHECK: %[[Shuffle:[0-9]+]] = OpVectorShuffle %[[V4F32_ID]] {{.*}} 0 1 2 3
+define internal void @test_transpose_f32_1x4_to_4x1() {
+ %1 = load <4 x float>, ptr addrspace(10) @private_v4f32
+ %2 = call <4 x float> @llvm.matrix.transpose.v4f32.i32(<4 x float> %1, i32 1, i32 4)
+ store <4 x float> %2, ptr addrspace(10) @private_v4f32
+ ret void
+}
+
+; Test Transpose 4x1 float (Result is 1x4 float), should be a copy (vector of 4 floats)
+; CHECK-LABEL: ; -- Begin function test_transpose_f32_4x1_to_1x4
+; CHECK: %[[Shuffle:[0-9]+]] = OpVectorShuffle %[[V4F32_ID]] {{.*}} 0 1 2 3
+define internal void @test_transpose_f32_4x1_to_1x4() {
+ %1 = load <4 x float>, ptr addrspace(10) @private_v4f32
+ %2 = call <4 x float> @llvm.matrix.transpose.v4f32.i32(<4 x float> %1, i32 4, i32 1)
+ store <4 x float> %2, ptr addrspace(10) @private_v4f32
+ ret void
+}
+
+; Test Transpose 1x1 float (Result is 1x1 float), should be a copy (scalar float)
+; TODO(171175): The SPIR-V backend does not seem to be legalizing single element vectors.
+; define internal void @test_transpose_f32_1x1() {
+;   %1 = load <1 x float>, ptr addrspace(10) @private_v1f32
+;   %2 = call <1 x float> @llvm.matrix.transpose.v1f32.i32(<1 x float> %1, i32 1, i32 1)
+;   store <1 x float> %2, ptr addrspace(10) @private_v1f32
+;   ret void
+; }
+
+define void @main() #0 {
+  ret void
+}
+
+declare <4 x float> @llvm.matrix.transpose.v4f32.i32(<4 x float>, i32, i32)
+declare <6 x float> @llvm.matrix.transpose.v6f32.i32(<6 x float>, i32, i32)
+; declare <1 x float> @llvm.matrix.transpose.v1f32.i32(<1 x float>, i32, i32)
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }

>From c381a0918a2e929e573c35e8e88b799a4b6e3195 Mon Sep 17 00:00:00 2001
From: Victor Chernyakin <chernyakin.victor.j at outlook.com>
Date: Sat, 27 Dec 2025 10:51:21 -0600
Subject: [PATCH 19/34] [clang-tidy] Add C support to
 `misc-use-internal-linkage` (#173196)

Right now, this check simply doesn't work in C, because we exclude
anything that `isExternC` from analysis (in C, everything `isExternC`).

Besides that, the docs and diagnostic message talk about anonymous
namespaces, which don't exist in C (this was noted in #97969, I'm just
summarizing).

The existing tests use abbreviated `// CHECK-MESSAGES` assertions (e.g.
`// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: function 'cxf'`), but I've
expanded them out. Yes, it's verbose, but now that the diagnostic
message has an important difference between C and C++, I feel it's
important that we test it.
---
 .../misc/UseInternalLinkageCheck.cpp          | 35 +++++++++++--------
 clang-tools-extra/docs/ReleaseNotes.rst       |  1 +
 .../checks/misc/use-internal-linkage.rst      | 15 ++++----
 .../misc/Inputs/use-internal-linkage/func.h   |  2 +-
 .../Inputs/use-internal-linkage/func_h.inc    |  2 +-
 .../misc/Inputs/use-internal-linkage/var.h    |  2 +-
 .../misc/use-internal-linkage-consteval.cpp   |  2 +-
 .../use-internal-linkage-fix-mode-none.cpp    |  4 +--
 .../misc/use-internal-linkage-func.cpp        | 24 ++++++-------
 .../misc/use-internal-linkage-var.cpp         | 12 +++----
 .../checkers/misc/use-internal-linkage.c      | 33 +++++++++++++++++
 11 files changed, 88 insertions(+), 44 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage.c

diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp
index 617b98484f3e4..68115cb28e7c8 100644
--- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp
@@ -96,6 +96,12 @@ AST_MATCHER(FunctionDecl, isAllocationOrDeallocationOverloadedFunction) {
   return OverloadedOperators.contains(Node.getOverloadedOperator());
 }
 
+AST_POLYMORPHIC_MATCHER(isExplicitlyExternC,
+                        AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl,
+                                                        VarDecl)) {
+  return Finder->getASTContext().getLangOpts().CPlusPlus && Node.isExternC();
+}
+
 AST_MATCHER(TagDecl, hasNameForLinkage) { return Node.hasNameForLinkage(); }
 
 AST_MATCHER(CXXRecordDecl, isExplicitTemplateInstantiation) {
@@ -142,20 +148,21 @@ void UseInternalLinkageCheck::registerMatchers(MatchFinder *Finder) {
         functionDecl(
             Common, hasBody(),
             unless(anyOf(
-                isExternC(), isStaticStorageClass(), isExternStorageClass(),
-                isExplicitTemplateSpecialization(), cxxMethodDecl(),
-                isConsteval(), isAllocationOrDeallocationOverloadedFunction(),
-                isMain())))
+                isExplicitlyExternC(), isStaticStorageClass(),
+                isExternStorageClass(), isExplicitTemplateSpecialization(),
+                cxxMethodDecl(), isConsteval(),
+                isAllocationOrDeallocationOverloadedFunction(), isMain())))
             .bind("fn"),
         this);
   if (AnalyzeVariables)
-    Finder->addMatcher(varDecl(Common, hasGlobalStorage(),
-                               unless(anyOf(isExternC(), isStaticStorageClass(),
-                                            isExternStorageClass(),
-                                            isExplicitTemplateSpecialization(),
-                                            hasThreadStorageDuration())))
-                           .bind("var"),
-                       this);
+    Finder->addMatcher(
+        varDecl(Common, hasGlobalStorage(),
+                unless(anyOf(isExplicitlyExternC(), isStaticStorageClass(),
+                             isExternStorageClass(),
+                             isExplicitTemplateSpecialization(),
+                             hasThreadStorageDuration())))
+            .bind("var"),
+        this);
   if (getLangOpts().CPlusPlus && AnalyzeTypes)
     Finder->addMatcher(
         tagDecl(Common, isDefinition(), hasNameForLinkage(),
@@ -169,13 +176,13 @@ void UseInternalLinkageCheck::registerMatchers(MatchFinder *Finder) {
 }
 
 static constexpr StringRef Message =
-    "%0 %1 can be made static or moved into an anonymous namespace "
+    "%0 %1 can be made static %select{|or moved into an anonymous namespace }2"
     "to enforce internal linkage";
 
 void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) {
   if (const auto *FD = Result.Nodes.getNodeAs<FunctionDecl>("fn")) {
     const DiagnosticBuilder DB = diag(FD->getLocation(), Message)
-                                 << "function" << FD;
+                                 << "function" << FD << getLangOpts().CPlusPlus;
     const SourceLocation FixLoc = FD->getInnerLocStart();
     if (FixLoc.isInvalid() || FixLoc.isMacroID())
       return;
@@ -191,7 +198,7 @@ void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) {
       return;
 
     const DiagnosticBuilder DB = diag(VD->getLocation(), Message)
-                                 << "variable" << VD;
+                                 << "variable" << VD << getLangOpts().CPlusPlus;
     const SourceLocation FixLoc = VD->getInnerLocStart();
     if (FixLoc.isInvalid() || FixLoc.isMacroID())
       return;
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 7b1640594a3d3..86bfd1d489898 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -528,6 +528,7 @@ Changes in existing checks
   user-defined types (structs, classes, unions, and enums) internal
   linkage. Added fine-grained options to control whether the check
   should diagnose functions, variables, and/or user-defined types.
+  Enabled the check for C.
 
 - Improved :doc:`modernize-avoid-c-arrays
   <clang-tidy/checks/modernize/avoid-c-arrays>` to not diagnose array types
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/use-internal-linkage.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/use-internal-linkage.rst
index 941221573fc86..9c160756e3873 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/misc/use-internal-linkage.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/misc/use-internal-linkage.rst
@@ -4,7 +4,7 @@ misc-use-internal-linkage
 =========================
 
 Detects variables, functions, and classes that can be marked as static or
-moved into an anonymous namespace to enforce internal linkage.
+(in C++) moved into an anonymous namespace to enforce internal linkage.
 
 Any entity that's only used within a single file should be given internal
 linkage. Doing so gives the compiler more information, allowing it to better
@@ -18,6 +18,14 @@ Example:
 
   void fn1() {} // can be marked as static
 
+  // already declared as extern
+  extern int v2;
+
+  void fn3(); // without function body in all declaration, maybe external linkage
+  void fn3();
+
+  // === C++-specific ===
+
   struct S1 {}; // can be moved into anonymous namespace
 
   namespace {
@@ -26,11 +34,6 @@ Example:
     void fn2();
     struct S2 {};
   }
-  // already declared as extern
-  extern int v2;
-
-  void fn3(); // without function body in all declaration, maybe external linkage
-  void fn3();
 
   // export declarations
   export void fn4() {}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/func.h b/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/func.h
index 0f2b576a126c4..c0f967523d1c3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/func.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/func.h
@@ -1,5 +1,5 @@
 #pragma once
 
-void func_header();
+void func_header(void);
 
 #include "func_h.inc"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/func_h.inc b/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/func_h.inc
index 1130f710edd7c..bd29019080de3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/func_h.inc
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/func_h.inc
@@ -1 +1 @@
-void func_h_inc();
+void func_h_inc(void);
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/var.h b/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/var.h
index 37e4cfbafff14..844c61a1135af 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/var.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/use-internal-linkage/var.h
@@ -1,3 +1,3 @@
 #pragma once
 
-extern int gloabl_header;
+extern int global_header;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-consteval.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-consteval.cpp
index 62c9818e07c4f..b63e87da25cea 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-consteval.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-consteval.cpp
@@ -3,5 +3,5 @@
 consteval void gh122096() {}
 
 constexpr void cxf() {}
-// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: function 'cxf'
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: function 'cxf' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static constexpr void cxf() {}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-fix-mode-none.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-fix-mode-none.cpp
index 3f2f5897bf718..8eb4a40d2d7d6 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-fix-mode-none.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-fix-mode-none.cpp
@@ -2,9 +2,9 @@
 // RUN:   -config="{CheckOptions: {misc-use-internal-linkage.FixMode: 'None'}}"  -- -I%S/Inputs/use-internal-linkage
 
 void func() {}
-// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func'
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES-NOT: static void func() {}
 
 int global;
-// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'global'
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'global' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES-NOT: static int global;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
index d25e4383613f7..764208443bdc1 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
@@ -14,51 +14,51 @@
 #include "func.h"
 
 void func() {}
-// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func'
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static void func() {}
 
 template<class T>
 void func_template() {}
-// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_template'
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_template' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static void func_template() {}
 
 void func_cpp_inc() {}
-// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc'
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static void func_cpp_inc() {}
 
 int* func_cpp_inc_return_ptr() { return nullptr; }
-// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc_return_ptr'
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc_return_ptr' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static int* func_cpp_inc_return_ptr() { return nullptr; }
 
 const int* func_cpp_inc_return_const_ptr() { return nullptr; }
-// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_const_ptr'
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_const_ptr' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static const int* func_cpp_inc_return_const_ptr() { return nullptr; }
 
 int const* func_cpp_inc_return_ptr_const() { return nullptr; }
-// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_ptr_const'
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_ptr_const' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static int const* func_cpp_inc_return_ptr_const() { return nullptr; }
 
 int * const func_cpp_inc_return_const() { return nullptr; }
-// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: function 'func_cpp_inc_return_const'
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: function 'func_cpp_inc_return_const' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static int * const func_cpp_inc_return_const() { return nullptr; }
 
 volatile const int* func_cpp_inc_return_volatile_const_ptr() { return nullptr; }
-// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: function 'func_cpp_inc_return_volatile_const_ptr'
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: function 'func_cpp_inc_return_volatile_const_ptr' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static volatile const int* func_cpp_inc_return_volatile_const_ptr() { return nullptr; }
 
 [[nodiscard]] void func_nodiscard() {}
-// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: function 'func_nodiscard'
+// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: function 'func_nodiscard' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: {{\[\[nodiscard\]\]}} static void func_nodiscard() {}
 
 #define NDS [[nodiscard]]
 #define NNDS
 
 NDS void func_nds() {}
-// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: function 'func_nds'
+// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: function 'func_nds' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: NDS static void func_nds() {}
 
 NNDS void func_nnds() {}
-// CHECK-MESSAGES: :[[@LINE-1]]:11: warning: function 'func_nnds'
+// CHECK-MESSAGES: :[[@LINE-1]]:11: warning: function 'func_nnds' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: NNDS static void func_nnds() {}
 
 #include "func_cpp.inc"
@@ -87,7 +87,7 @@ extern "C" void func_extern_c_2() {}
 
 namespace gh117488 {
 void func_with_body();
-// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_with_body'
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_with_body' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static void func_with_body();
 void func_with_body() {}
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp
index 1be7165f9ffe6..7de5259a0a160 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp
@@ -14,27 +14,27 @@
 #include "var.h"
 
 int global;
-// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'global'
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'global' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static int global;
 
 template<class T>
 T global_template;
-// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: variable 'global_template'
+// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: variable 'global_template' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static T global_template;
 
 int const* ptr_const_star;
-// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'ptr_const_star'
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'ptr_const_star' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static int const* ptr_const_star;
 
 const int* const_ptr_star;
-// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'const_ptr_star'
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'const_ptr_star' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static const int* const_ptr_star;
 
 const volatile int* const_volatile_ptr_star;
-// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: variable 'const_volatile_ptr_star'
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: variable 'const_volatile_ptr_star' can be made static or moved into an anonymous namespace to enforce internal linkage
 // CHECK-FIXES: static const volatile int* const_volatile_ptr_star;
 
-int gloabl_header;
+int global_header;
 
 extern int global_extern;
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage.c b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage.c
new file mode 100644
index 0000000000000..05f9349ea3214
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage.c
@@ -0,0 +1,33 @@
+// RUN: %check_clang_tidy %s misc-use-internal-linkage %t -- -- -I%S/Inputs/use-internal-linkage
+// RUN: %check_clang_tidy %s misc-use-internal-linkage %t -- \
+// RUN:   -config="{CheckOptions: {misc-use-internal-linkage.FixMode: 'UseStatic'}}"  -- -I%S/Inputs/use-internal-linkage
+
+#include "func.h"
+
+void func(void) {}
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func' can be made static to enforce internal linkage
+// CHECK-FIXES: static void func(void) {}
+
+void func_header(void) {}
+extern void func_extern(void) {}
+static void func_static(void) {}
+
+int main(void) {}
+
+
+#include "var.h"
+
+int global;
+// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: variable 'global' can be made static to enforce internal linkage
+// CHECK-FIXES: static int global;
+
+const int const_global = 123;
+// CHECK-MESSAGES: :[[@LINE-1]]:11: warning: variable 'const_global' can be made static to enforce internal linkage
+// CHECK-FIXES: static const int const_global = 123;
+
+int global_header;
+extern int global_extern;
+static int global_static;
+#if __STDC_VERSION__ >= 201112L
+_Thread_local int global_thread_local;
+#endif

>From d777b1a23038331320b83e9b313f0dd40f675273 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 27 Dec 2025 17:02:47 +0000
Subject: [PATCH 20/34] [VPlan] Skip phi recipes in tryToBuildVPlan (NFC).

No phi recipes are being transformed in the main loop any longer, so
skip phi recipes.

This also allows us to clarify which recipes need skipping explicitly.
Those are recipes that have been already transformed.

Follow-up to post-commit comment in
https://github.com/llvm/llvm-project/pull/168291.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2c62f64be1c9b..40c3efb644527 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8457,12 +8457,13 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // Now process all other blocks and instructions.
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
     // Convert input VPInstructions to widened recipes.
-    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      auto *VPI = dyn_cast<VPInstruction>(&R);
-      // Skip recipes that do not need transforming, including
-      // non-VPInstructions (such as ...) and VPInstructions without underlying
-      // values. The latter are added above for masking.
-      if (!VPI || !VPI->getUnderlyingValue())
+    for (VPRecipeBase &R : make_early_inc_range(
+             make_range(VPBB->getFirstNonPhi(), VPBB->end()))) {
+      // Skip recipes that do not need transforming.
+      if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(&R))
+        continue;
+      auto *VPI = cast<VPInstruction>(&R);
+      if (!VPI->getUnderlyingValue())
         continue;
 
       // TODO: Gradually replace uses of underlying instruction by analyses on

>From 2c376ffeca490a5732e4fd6e98e5351fcf6d692a Mon Sep 17 00:00:00 2001
From: Muhammad Abdul <alilo.ghazali at gmail.com>
Date: Sun, 28 Dec 2025 00:51:29 +0700
Subject: [PATCH 21/34] [AMDGPU] add clamp immediate operand to WMMA iu8
 intrinsic (#171069)

Fixes #166989

- Adds a clamp immediate operand to the AMDGPU WMMA iu8 intrinsic and
threads it through LLVM IR, MIR lowering, Clang builtins/tests, and MLIR
ROCDL dialect so all layers agree on the new operand
- Updates AMDGPUWmmaIntrinsicModsAB so the clamp attribute is emitted,
teaches VOP3P encoding to accept the immediate, and adjusts Clang
codegen/builtin headers plus MLIR op definitions and tests to match
- Documents what the WMMA clamp operand does
- Implements bitcode AutoUpgrade for source compatibility on the WMMA IU8
intrinsic op

Possible future enhancements:
- infer clamping as an optimization fold based on the use context

---------

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  2 +-
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  7 ++++
 clang/lib/Sema/SemaAMDGPU.cpp                 | 10 ++++++
 .../builtins-amdgcn-gfx1250-wmma-w32.cl       | 15 +++++++--
 ...ins-amdgcn-error-gfx1250-wmma-w32-param.cl | 14 +++++---
 llvm/docs/AMDGPUUsage.rst                     |  8 +++++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  3 +-
 llvm/lib/IR/AutoUpgrade.cpp                   | 32 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  4 ++-
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |  6 ++--
 .../Bitcode/amdgpu-wmma-iu8-clamp-upgrade.ll  | 21 ++++++++++++
 .../AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll    |  4 +--
 .../llvm.amdgcn.wmma.imm.gfx1250.w32.ll       |  8 ++---
 .../llvm.amdgcn.wmma.imod.gfx1250.w32.ll      |  6 ++--
 .../AMDGPU/wmma-coececution-valu-hazards.mir  | 24 +++++++-------
 .../AMDGPU/wmma-hazards-gfx1250-w32.mir       | 20 ++++++------
 llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s    | 11 +++++--
 .../test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s |  3 --
 .../AMDGPU/gfx1250_dasm_wmma_w32.txt          |  6 ++++
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  | 11 ++++---
 mlir/test/Target/LLVMIR/rocdl.mlir            | 18 +++++++----
 21 files changed, 172 insertions(+), 61 deletions(-)
 create mode 100644 llvm/test/Bitcode/amdgpu-wmma-iu8-clamp-upgrade.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 224aa2ea30bad..dad45556bec63 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -857,7 +857,7 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8, "V8hV8iV8iIsV8hIbIb",
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb.", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index eabdc370da6b4..8432c4623c8ad 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1665,6 +1665,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     if (AppendFalseForOpselArg)
       Args.push_back(Builder.getFalse());
 
+    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8) {
+      if (Args.size() == 7)
+        Args.push_back(Builder.getFalse());
+      if (Args.size() == 8)
+        Args[7] = Builder.CreateZExtOrTrunc(Args[7], Builder.getInt1Ty());
+    }
+
     SmallVector<llvm::Type *, 6> ArgTypes;
     if (NeedReturnType)
       ArgTypes.push_back(ConvertType(E->getType()));
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index cece22092bb14..a896a4de39ad1 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -71,6 +71,16 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_get_fpenv:
   case AMDGPU::BI__builtin_amdgcn_set_fpenv:
     return false;
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
+    // Legacy form omitted the optional clamp operand
+    if (SemaRef.checkArgCountRange(TheCall, 7, 8))
+      return true;
+    if (TheCall->getNumArgs() == 8) {
+      llvm::APSInt Result;
+      if (SemaRef.BuiltinConstantArg(TheCall, 7, Result))
+        return true;
+    }
+    return false;
   case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
index bdb1a7f0bb32f..bfae3299f8e95 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -148,13 +148,24 @@ void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c)
 
 // CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8(
 // CHECK-GFX1250-NEXT:  entry:
-// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true, i1 false)
 // CHECK-GFX1250-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
 // CHECK-GFX1250-NEXT:    ret void
 //
 void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c)
 {
-  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true);
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true, false);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8_no_clamp(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 false, i1 false)
+// CHECK-GFX1250-NEXT:    store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_wmma_i32_16x16x64_iu8_no_clamp(global v8i* out, v8i a, v8i b, v8i c)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, false);
 }
 
 // CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_f8f6f4(
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
index 49ef2e571740c..05cfa9a2f7828 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
@@ -108,10 +108,16 @@ void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c,
 
 void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c, int mod)
 {
-  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(mod, a, 0, b, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
-  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, mod, b, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
-  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
-  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(mod, a, 0, b, c, false, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, mod, b, c, false, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, mod, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, false, mod); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_i32_16x16x64_iu8_too_many(global v8i *out, v8i a, v8i b, v8i c)
+{
+  *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, false, false, false); // expected-error {{too many arguments to function call, expected at most 8, have 9}}
 }
 
 void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int mod)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 7ecf1c1124894..21c282fae305b 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1602,6 +1602,14 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
    List AMDGPU intrinsics.
 
+WMMA clamp operand
+~~~~~~~~~~~~~~~~~~
+
+The WMMA integer matrix multiply intrinsics and C builtins (IU4/IU8, wave32 and
+wave64 forms) accept an optional boolean clamp operand. It defaults to 0 (no
+saturation) for backward compatibility. When set, the hardware clamps the
+32-bit accumulation result instead of allowing wraparound.
+
 '``llvm.amdgcn.cooperative.atomic``' Intrinsics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 19d5f24c5d5e0..2afe89357a991 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3968,8 +3968,9 @@ class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> :
       LLVMMatchType<0>,               // %C
       llvm_i1_ty,       // matrix_a_reuse
       llvm_i1_ty,       // matrix_b_reuse
+      llvm_i1_ty,       // %clamp
     ],
-    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
+    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
      IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index c8004ee53c529..deb5f41a02b06 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
@@ -1284,6 +1285,13 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
         break; // No other 'amdgcn.atomic.*'
       }
 
+      if (F->arg_size() == 7 &&
+          F->getIntrinsicID() == Intrinsic::amdgcn_wmma_i32_16x16x64_iu8) {
+        // Legacy wmma iu8 intrinsic without the optional clamp operand.
+        NewFn = nullptr;
+        return true;
+      }
+
       if (Name.consume_front("ds.") || Name.consume_front("global.atomic.") ||
           Name.consume_front("flat.atomic.")) {
         if (Name.starts_with("fadd") ||
@@ -4613,6 +4621,30 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 //
 static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
                                          Function *F, IRBuilder<> &Builder) {
+  if (CI->arg_size() == 7 &&
+      F->getIntrinsicID() == Intrinsic::amdgcn_wmma_i32_16x16x64_iu8) {
+    // Legacy WMMA IU8 intrinsic lacked the optional clamp operand. Append
+    // clamp=false for compatibility.
+
+    SmallVector<Value *, 8> Args(CI->args().begin(), CI->args().end());
+    Args.push_back(Builder.getFalse());
+
+    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
+        F->getParent(), Intrinsic::amdgcn_wmma_i32_16x16x64_iu8,
+        {CI->getArgOperand(4)->getType(), CI->getArgOperand(1)->getType()});
+
+    SmallVector<OperandBundleDef, 1> Bundles;
+    CI->getOperandBundlesAsDefs(Bundles);
+
+    auto *NewCall = cast<CallInst>(Builder.CreateCall(NewDecl, Args, Bundles));
+    NewCall->setTailCallKind(cast<CallInst>(CI)->getTailCallKind());
+    NewCall->setCallingConv(CI->getCallingConv());
+    NewCall->setAttributes(CI->getAttributes());
+    NewCall->setDebugLoc(CI->getDebugLoc());
+    NewCall->copyMetadata(*CI);
+    return NewCall;
+  }
+
   AtomicRMWInst::BinOp RMWOp =
       StringSwitch<AtomicRMWInst::BinOp>(Name)
           .StartsWith("ds.fadd", AtomicRMWInst::FAdd)
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index bf87a48487ab4..d06b22290375e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1803,7 +1803,9 @@ def F32_FP8BF8X128_WMMA_w32      : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f
 def F16_FP8BF8X64_WMMA_w32       : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
 def F16_FP8BF8X128_WMMA_w32      : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
 def F32_32X16X128_F4_WMMA_w32    : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>;
-def I32_IU8X64_WMMA_w32          : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>;
+def I32_IU8X64_WMMA_w32          : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1> {
+  let HasClamp = 1;
+}
 def F32_32X16X128_F4_SCALE_w32   : VOP3PWMMA_Profile<[v16f32, v16i32,  v8i32,  v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>;
 def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32,  v8i32,  v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>;
 def F32_F16X64_SWMMAC_w32        : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index d5c6000a1eef6..8bd28d711ebf7 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -295,9 +295,9 @@ define amdgpu_kernel void @wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B,
   ret void
 }
 
-; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> %A, i1 false, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false)
+; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> %A, i1 false, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false, i1 false)
 define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
-  %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false)
+  %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false, i1 false)
   store <8 x i32> %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -903,7 +903,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>,
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
-declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
diff --git a/llvm/test/Bitcode/amdgpu-wmma-iu8-clamp-upgrade.ll b/llvm/test/Bitcode/amdgpu-wmma-iu8-clamp-upgrade.ll
new file mode 100644
index 0000000000000..7a4b89d91d13c
--- /dev/null
+++ b/llvm/test/Bitcode/amdgpu-wmma-iu8-clamp-upgrade.ll
@@ -0,0 +1,21 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+; Verify that the legacy WMMA IU8 intrinsic without the clamp operand is
+; upgraded by appending clamp=false.
+
+define <8 x i32> @wmma_legacy(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c) {
+; CHECK-LABEL: @wmma_legacy(
+; CHECK-NEXT: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> %a, i1 false, <8 x i32> %b, <8 x i32> %c, i1 false, i1 false, i1 false) #1, !annotation !0
+; CHECK-NEXT: ret <8 x i32>
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(
+      i1 false, <8 x i32> %a, i1 false, <8 x i32> %b, <8 x i32> %c,
+      i1 false, i1 false) #0, !annotation !0
+  ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(
+  i1, <8 x i32>, i1, <8 x i32>, <8 x i32>, i1, i1)
+
+attributes #0 = { "preserve-me" }
+
+!0 = !{!"preserve-me"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
index 1150578a5ae92..f56a93e470bd3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -285,7 +285,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; GISEL-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 true)
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 true, i1 false)
   store <8 x i32> %res, ptr addrspace(1) %out
   ret void
 }
@@ -2968,7 +2968,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>,
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
-declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
index 037e26087eaa5..e439777429a88 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -1149,7 +1149,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, pt
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
 ; GISEL-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 false, i1 false)
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 false, i1 false, i1 false)
   store <8 x i32> %res, ptr addrspace(1) %out
   ret void
 }
@@ -1191,7 +1191,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
 ; GISEL-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 1, i32 1, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 false, i1 false)
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 1, i32 1, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 false, i1 false, i1 false)
   store <8 x i32> %res, ptr addrspace(1) %out
   ret void
 }
@@ -1235,7 +1235,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, <
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
 ; GISEL-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 false, i1 false)
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 false, i1 false, i1 false)
   store <8 x i32> %res, ptr addrspace(1) %out
   ret void
 }
@@ -3020,7 +3020,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>,
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
-declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
index eb7c15587654c..8e3c07e7eb17c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
@@ -989,7 +989,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedA(<8 x i32> %A, <8 x i32
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; GISEL-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 1, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false)
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 1, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false, i1 false)
   store <8 x i32> %res, ptr addrspace(1) %out
   ret void
 }
@@ -1013,7 +1013,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedB(<8 x i32> %A, <8 x i32
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
 ; GISEL-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 1, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false)
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 1, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false, i1 false)
   store <8 x i32> %res, ptr addrspace(1) %out
   ret void
 }
@@ -2538,7 +2538,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>,
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
-declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir
index 2f7a6e257bb96..acfe15a9d8c84 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir
@@ -319,7 +319,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -329,7 +329,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
 ...
 
@@ -338,7 +338,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec
     ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec
@@ -348,7 +348,7 @@ body: |
     ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec
     ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr40 = V_MOV_B32_e32 40, implicit $exec
     $vgpr41 = V_MOV_B32_e32 41, implicit $exec
     $vgpr42 = V_MOV_B32_e32 42, implicit $exec
@@ -365,7 +365,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0
     ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1
     ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2
@@ -383,7 +383,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $sgpr0 = S_MOV_B32 0
     $sgpr1 = S_MOV_B32 1
     $sgpr2 = S_MOV_B32 2
@@ -400,7 +400,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -410,7 +410,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
 
@@ -419,7 +419,7 @@ name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -429,7 +429,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
 
@@ -438,7 +438,7 @@ name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -448,7 +448,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec
 ...
 
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
index fa3b9244c3e4a..cbea38155cef9 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir
@@ -574,7 +574,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -584,9 +584,9 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
-    ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
-    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
 
 ---
@@ -594,7 +594,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -604,9 +604,9 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
-    ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
-    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
+    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec
 ...
 
 ---
@@ -614,7 +614,7 @@ name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1
 body: |
   bb.0:
     ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1
-    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
@@ -625,7 +625,7 @@ body: |
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: V_NOP_e32 implicit $exec
     ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
-    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec
     $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec
 ...
 
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
index fcfff9ac5b63d..24f4feca41737 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
@@ -483,14 +483,19 @@ v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0]
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x5c]
 // WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
-v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp
 // GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x72,0xcc,0x00,0x11,0x42,0x1c]
+// GFX1250-DAG: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp ; encoding: [0x10,0x80,0x72,0xcc,0x00,0x11,0x42,0x1c]
 // WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 // GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c]
+// GFX1250-DAG: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
+
+v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250-DAG: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x72,0xcc,0x00,0x11,0x42,0x1c]
 // WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
index 41cac9d1470ae..38a9e12f4a284 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s
@@ -126,9 +126,6 @@ v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], s[16:23]
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 128
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp
-// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
index 5d73cbd512edb..92031a0f28b54 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
@@ -1,6 +1,12 @@
 # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 # RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
 
+0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x80,0x72,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp ; encoding: [0x10,0x80,0x72,0xcc,0x00,0x11,0x42,0x1c]
+
 0x18,0x00,0x68,0xcc,0x00,0x11,0x72,0x1c
 # GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x68,0xcc,0x00,0x11,0x72,0x1c]
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index f0a9d97b6daaf..66fae7dc3aa6e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -805,15 +805,16 @@ class ROCDL_WMMA_ModsAll_Diff_IntrOp<string mnemonic, Type AB, Type C, Type D> :
 }
 
 class ROCDL_WMMA_ModsAB_IntrOp<string mnemonic, Type AB, Type CD> : ROCDL_IntrOp<mnemonic,
-    [0], [1], [], 1, 0, 0, 0, [0, 2, 5, 6], ["signA", "signB", "reuseA","reuseB"]>,
-  Arguments<(ins
+    [0], [1], [], 1, 0, 0, 0, [0, 2, 5, 6, 7], ["signA", "signB", "reuseA","reuseB", "clamp"]>,
+  Arguments<(ins 
              DefaultValuedAttr<I1Attr, "0">:$signA,
              LLVM_ScalarOrVectorOf<AB>:$a,
              DefaultValuedAttr<I1Attr, "0">:$signB,
              LLVM_ScalarOrVectorOf<AB>:$b,
-             LLVM_ScalarOrVectorOf<CD>:$c,
-             DefaultValuedAttr<I1Attr, "0">:$reuseA,
-             DefaultValuedAttr<I1Attr, "0">:$reuseB)> {
+             LLVM_ScalarOrVectorOf<CD>:$c, 
+             DefaultValuedAttr<I1Attr, "0">:$reuseA, 
+             DefaultValuedAttr<I1Attr, "0">:$reuseB,
+             DefaultValuedAttr<I1Attr, "0">:$clamp)> {
   let results = (outs LLVM_ScalarOrVectorOf<CD>:$res);
   let assemblyFormat = [{
     $a `,` $b `,` $c attr-dict `:` functional-type(operands, $res)
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 9022beb71ee31..f78c53a28b896 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1156,16 +1156,20 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
   %r22.gfx1250 = rocdl.wmma.f16.16x16x128.bf8_bf8 %arg5, %arg5, %arg15 {signA = false, signB = false, modC = 0 : i16} : (vector<4xi32>, vector<4xi32>, vector<64xf16>) -> vector<64xf16>
 
   // iu8 -> i32
-  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 false, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 false, i1 false)
-  %r23.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = false, signB = false} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
+  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 false, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 false, i1 false, i1 false)
+  %r23.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = false, signB = false, clamp = false} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
 
-  // Test signA=true, signB=true for iu8 gfx1250
-  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 true, <4 x i32> %{{.*}} i1 true, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 false, i1 false)
-  %r23a.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = true, signB = true} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
+  // Test signA=true, signB=true for iu8 gfx1250  
+  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 true, <4 x i32> %{{.*}} i1 true, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 false, i1 false, i1 false)
+  %r23a.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = true, signB = true, clamp = false} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
 
   // Test signA=true, signB=false, reuseA=true, reuseB=true for iu8 gfx1250
-  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 true, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 true, i1 true)
-  %r23b.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = true, signB = false, reuseA = true, reuseB = true} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
+  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 true, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 true, i1 true, i1 false)
+  %r23b.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = true, signB = false, reuseA = true, reuseB = true, clamp = false} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
+
+  // Test clamp=true for iu8 gfx1250
+  // CHECK: call <64 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v64i32.v4i32(i1 false, <4 x i32> %{{.*}} i1 false, <4 x i32> %{{.*}} <64 x i32> %{{.*}} i1 false, i1 false, i1 true)
+  %r23c.gfx1250 = rocdl.wmma.i32.16x16x64.iu8 %arg5, %arg5, %arg14 {signA = false, signB = false, clamp = true} : (vector<4xi32>, vector<4xi32>, vector<64xi32>) -> vector<64xi32>
 
   // Test signA=true, signB=true with modC=1 for f32 gfx1250
   // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v4f32.v16f32(i1 true, <16 x float> %{{.*}} i1 true, <16 x float> %{{.*}} i16 1, <4 x float> %{{.*}} i1 false, i1 false)

>From 1ee3178f95665ec2a494dad622d9f0175f4ed883 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Sat, 27 Dec 2025 09:58:46 -0800
Subject: [PATCH 22/34] [LegalizeDAG] Remove unnecessary EVT->MVT->EVT
 conversion. NFC (#173707)

There doesn't appear to be any reason to use MVT here. All of the uses
expect an EVT.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index cb42d420ab3ca..6476b828448c5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4033,7 +4033,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   case ISD::SMUL_LOHI: {
     SDValue LHS = Node->getOperand(0);
     SDValue RHS = Node->getOperand(1);
-    MVT VT = LHS.getSimpleValueType();
+    EVT VT = LHS.getValueType();
     unsigned MULHOpcode =
         Node->getOpcode() == ISD::UMUL_LOHI ? ISD::MULHU : ISD::MULHS;
 
@@ -4044,7 +4044,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
 
     SmallVector<SDValue, 4> Halves;
-    EVT HalfType = EVT(VT).getHalfSizedIntegerVT(*DAG.getContext());
+    EVT HalfType = VT.getHalfSizedIntegerVT(*DAG.getContext());
     assert(TLI.isTypeLegal(HalfType));
     if (TLI.expandMUL_LOHI(Node->getOpcode(), VT, dl, LHS, RHS, Halves,
                            HalfType, DAG,

>From f59e2b20ead28738ea6350a922ed1867f5d47139 Mon Sep 17 00:00:00 2001
From: paperchalice <liujunchang97 at outlook.com>
Date: Sun, 28 Dec 2025 08:29:06 +0800
Subject: [PATCH 23/34] [CodeGen] Port gc-empty-basic-blocks to new pass
 manager (#137585)

Co-authored-by: Aiden Grossman <aidengrossman at google.com>
---
 .../include/llvm/CodeGen/GCEmptyBasicBlocks.h | 24 +++++++++
 llvm/include/llvm/CodeGen/Passes.h            |  2 +-
 llvm/include/llvm/InitializePasses.h          |  2 +-
 .../llvm/Passes/MachinePassRegistry.def       |  2 +-
 llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp       | 32 ++++++++----
 llvm/lib/CodeGen/TargetPassConfig.cpp         |  2 +-
 llvm/lib/Passes/PassBuilder.cpp               |  1 +
 .../CodeGen/X86/gc-empty-basic-blocks.mir     | 51 +++++++++++++++++++
 8 files changed, 103 insertions(+), 13 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/GCEmptyBasicBlocks.h
 create mode 100644 llvm/test/CodeGen/X86/gc-empty-basic-blocks.mir

diff --git a/llvm/include/llvm/CodeGen/GCEmptyBasicBlocks.h b/llvm/include/llvm/CodeGen/GCEmptyBasicBlocks.h
new file mode 100644
index 0000000000000..a795ece07cdf5
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/GCEmptyBasicBlocks.h
@@ -0,0 +1,24 @@
+//===-- GCEmptyBasicBlocks.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_GCEMPTYBASICBLOCKS_H
+#define LLVM_CODEGEN_GCEMPTYBASICBLOCKS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class GCEmptyBasicBlocksPass : public PassInfoMixin<GCEmptyBasicBlocksPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_GCEMPTYBASICBLOCKS_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index fe19e6a32c680..5928013035377 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -61,7 +61,7 @@ LLVM_ABI FunctionPass *createUnreachableBlockEliminationPass();
 /// instructions. These blocks confuscate profile analysis (e.g., basic block
 /// sections) since they will share the address of their fallthrough blocks.
 /// This pass garbage-collects such basic blocks.
-LLVM_ABI MachineFunctionPass *createGCEmptyBasicBlocksPass();
+LLVM_ABI MachineFunctionPass *createGCEmptyBasicBlocksLegacyPass();
 
 /// createBasicBlockSections Pass - This pass assigns sections to machine
 /// basic blocks and is enabled with -fbasic-block-sections.
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 2bc00a2f71e1a..80d37759b683d 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -124,7 +124,7 @@ LLVM_ABI void initializeFixIrreduciblePass(PassRegistry &);
 LLVM_ABI void initializeFixupStatepointCallerSavedLegacyPass(PassRegistry &);
 LLVM_ABI void initializeFlattenCFGLegacyPassPass(PassRegistry &);
 LLVM_ABI void initializeFuncletLayoutPass(PassRegistry &);
-LLVM_ABI void initializeGCEmptyBasicBlocksPass(PassRegistry &);
+LLVM_ABI void initializeGCEmptyBasicBlocksLegacyPass(PassRegistry &);
 LLVM_ABI void initializeGCMachineCodeAnalysisPass(PassRegistry &);
 LLVM_ABI void initializeGCModuleInfoPass(PassRegistry &);
 LLVM_ABI void initializeGVNLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 04a0da06fb6ec..fe08d5411e50c 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -116,6 +116,7 @@ MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass())
 MACHINE_FUNCTION_PASS("finalizebundle-test", FinalizeBundleTestPass())
 MACHINE_FUNCTION_PASS("fixup-statepoint-caller-saved", FixupStatepointCallerSavedPass())
 MACHINE_FUNCTION_PASS("init-undef", InitUndefPass())
+MACHINE_FUNCTION_PASS("gc-empty-basic-blocks", GCEmptyBasicBlocksPass())
 MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass())
 MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass())
 MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass())
@@ -274,7 +275,6 @@ DUMMY_MACHINE_FUNCTION_PASS("cfi-instr-inserter", CFIInstrInserterPass)
 DUMMY_MACHINE_FUNCTION_PASS("dot-machine-cfg", MachineCFGPrinter)
 DUMMY_MACHINE_FUNCTION_PASS("fs-profile-loader", MIRProfileLoaderNewPass)
 DUMMY_MACHINE_FUNCTION_PASS("funclet-layout", FuncletLayoutPass)
-DUMMY_MACHINE_FUNCTION_PASS("gc-empty-basic-blocks", GCEmptyBasicBlocksPass)
 DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass)
 DUMMY_MACHINE_FUNCTION_PASS("instruction-select", InstructionSelectPass)
 DUMMY_MACHINE_FUNCTION_PASS("irtranslator", IRTranslatorPass)
diff --git a/llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp b/llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp
index 98470a1507668..53f804588287e 100644
--- a/llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp
+++ b/llvm/lib/CodeGen/GCEmptyBasicBlocks.cpp
@@ -11,6 +11,7 @@
 /// pass.
 ///
 //===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GCEmptyBasicBlocks.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -26,22 +27,35 @@ using namespace llvm;
 
 STATISTIC(NumEmptyBlocksRemoved, "Number of empty blocks removed");
 
-class GCEmptyBasicBlocks : public MachineFunctionPass {
+static bool removeEmptyBlocks(MachineFunction &MF);
+
+PreservedAnalyses
+GCEmptyBasicBlocksPass::run(MachineFunction &MF,
+                            MachineFunctionAnalysisManager &MFAM) {
+  bool Changed = removeEmptyBlocks(MF);
+  if (Changed)
+    return getMachineFunctionPassPreservedAnalyses();
+  return PreservedAnalyses::all();
+}
+
+class GCEmptyBasicBlocksLegacy : public MachineFunctionPass {
 public:
   static char ID;
 
-  GCEmptyBasicBlocks() : MachineFunctionPass(ID) {
-    initializeGCEmptyBasicBlocksPass(*PassRegistry::getPassRegistry());
+  GCEmptyBasicBlocksLegacy() : MachineFunctionPass(ID) {
+    initializeGCEmptyBasicBlocksLegacyPass(*PassRegistry::getPassRegistry());
   }
 
   StringRef getPassName() const override {
     return "Remove Empty Basic Blocks.";
   }
 
-  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    return removeEmptyBlocks(MF);
+  }
 };
 
-bool GCEmptyBasicBlocks::runOnMachineFunction(MachineFunction &MF) {
+bool removeEmptyBlocks(MachineFunction &MF) {
   if (MF.size() < 2)
     return false;
   MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
@@ -88,12 +102,12 @@ bool GCEmptyBasicBlocks::runOnMachineFunction(MachineFunction &MF) {
   return NumRemoved != 0;
 }
 
-char GCEmptyBasicBlocks::ID = 0;
-INITIALIZE_PASS(GCEmptyBasicBlocks, "gc-empty-basic-blocks",
+char GCEmptyBasicBlocksLegacy::ID = 0;
+INITIALIZE_PASS(GCEmptyBasicBlocksLegacy, "gc-empty-basic-blocks",
                 "Removes empty basic blocks and redirects their uses to their "
                 "fallthrough blocks.",
                 false, false)
 
-MachineFunctionPass *llvm::createGCEmptyBasicBlocksPass() {
-  return new GCEmptyBasicBlocks();
+MachineFunctionPass *llvm::createGCEmptyBasicBlocksLegacyPass() {
+  return new GCEmptyBasicBlocksLegacy();
 }
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index ca75d3b47cf3a..02284307aa3b4 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1246,7 +1246,7 @@ void TargetPassConfig::addMachinePasses() {
   }
 
   if (GCEmptyBlocks)
-    addPass(llvm::createGCEmptyBasicBlocksPass());
+    addPass(llvm::createGCEmptyBasicBlocksLegacyPass());
 
   if (EnableFSDiscriminator)
     addPass(createMIRAddFSDiscriminatorsPass(
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 84ee043b5da56..8bb78c8c7df63 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -98,6 +98,7 @@
 #include "llvm/CodeGen/FEntryInserter.h"
 #include "llvm/CodeGen/FinalizeISel.h"
 #include "llvm/CodeGen/FixupStatepointCallerSaved.h"
+#include "llvm/CodeGen/GCEmptyBasicBlocks.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
 #include "llvm/CodeGen/GlobalMerge.h"
diff --git a/llvm/test/CodeGen/X86/gc-empty-basic-blocks.mir b/llvm/test/CodeGen/X86/gc-empty-basic-blocks.mir
new file mode 100644
index 0000000000000..72bdfbe565bbd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gc-empty-basic-blocks.mir
@@ -0,0 +1,51 @@
+# TODO(boomanaiden154): Remove this and use gc-empty-basic-blocks.ll directly.
+# This should be trivial once we have an asm printer setup for X86 and at least
+# a pipeline skeleton.
+# RUN: llc %s -mtriple=x86_64 -passes=gc-empty-basic-blocks -o - | FileCheck %s
+--- |
+  define void @foo(i1 zeroext %0) #0 {
+    br i1 %0, label %2, label %empty_block
+  
+  2:                                                ; preds = %1
+    %3 = call i32 @baz()
+    br label %4
+  
+  empty_block:                                      ; preds = %1
+    unreachable
+  
+  4:                                                ; preds = %2
+    ret void
+  }
+
+  declare i32 @baz()
+  
+  attributes #0 = { nounwind }
+...
+---
+name:            foo
+alignment:       16
+body:             |
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $edi
+  
+    frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+    $al = MOV8rr $dil, implicit killed $edi
+    TEST8ri killed renamable $al, 1, implicit-def $eflags
+    JCC_1 %bb.1, 5, implicit killed $eflags
+    JMP_1 %bb.2
+  
+  bb.1:
+    successors: %bb.3(0x80000000)
+  
+    CALL64pcrel32 target-flags(x86-plt) @baz, csr_64, implicit $rsp, implicit $ssp, implicit-def $eax
+    JMP_1 %bb.3
+  
+  bb.2.empty_block:
+    successors:
+  
+  bb.3:
+    $rax = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    RET64
+...
+# CHECK-NOT: bb.2.empty_block:

>From 1765a953d146b28f7f09878a5b2129c93c0f42ad Mon Sep 17 00:00:00 2001
From: Twice <twice at apache.org>
Date: Sun, 28 Dec 2025 10:55:36 +0800
Subject: [PATCH 24/34] [MLIR][Python] Ensure `_Dialect` is imported for all
 dialects (#173729)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`from ._xxx_ops_gen import _Dialect` appears in some dialect modules,
like builtin, scf, irdl.. but not all of them. This PR ensures that for
upstream dialects, `<dialect module>._Dialect` is available, like
`arith._Dialect`.

This PR is a prerequisite for the work I’m currently doing. Later on,
I’d like to use these `_Dialect` objects via something like
`conversion_target.add_legal_dialect(arith._Dialect)` (we could of
course just use strings like `add_legal_dialect("arith")`, but compared
to using a defined symbol, I think that’s more prone to typos).
---
 mlir/python/mlir/dialects/amdgpu.py                 | 1 +
 mlir/python/mlir/dialects/async_dialect/__init__.py | 1 +
 mlir/python/mlir/dialects/bufferization.py          | 1 +
 mlir/python/mlir/dialects/cf.py                     | 1 +
 mlir/python/mlir/dialects/complex.py                | 1 +
 mlir/python/mlir/dialects/emitc.py                  | 1 +
 mlir/python/mlir/dialects/index.py                  | 1 +
 mlir/python/mlir/dialects/linalg/__init__.py        | 1 +
 mlir/python/mlir/dialects/llvm.py                   | 1 +
 mlir/python/mlir/dialects/math.py                   | 1 +
 mlir/python/mlir/dialects/memref.py                 | 1 +
 mlir/python/mlir/dialects/nvgpu.py                  | 1 +
 mlir/python/mlir/dialects/nvvm.py                   | 1 +
 mlir/python/mlir/dialects/openacc.py                | 1 +
 mlir/python/mlir/dialects/openmp.py                 | 1 +
 mlir/python/mlir/dialects/python_test.py            | 1 +
 mlir/python/mlir/dialects/rocdl.py                  | 1 +
 mlir/python/mlir/dialects/shape.py                  | 1 +
 mlir/python/mlir/dialects/shard.py                  | 1 +
 mlir/python/mlir/dialects/smt.py                    | 1 +
 mlir/python/mlir/dialects/sparse_tensor.py          | 1 +
 mlir/python/mlir/dialects/spirv.py                  | 1 +
 mlir/python/mlir/dialects/tosa.py                   | 1 +
 mlir/python/mlir/dialects/ub.py                     | 1 +
 mlir/python/mlir/dialects/vector.py                 | 1 +
 25 files changed, 25 insertions(+)

diff --git a/mlir/python/mlir/dialects/amdgpu.py b/mlir/python/mlir/dialects/amdgpu.py
index 1c4d274bc31af..38d8aaf355e4a 100644
--- a/mlir/python/mlir/dialects/amdgpu.py
+++ b/mlir/python/mlir/dialects/amdgpu.py
@@ -3,5 +3,6 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._amdgpu_ops_gen import *
+from ._amdgpu_ops_gen import _Dialect
 from ._amdgpu_enum_gen import *
 from .._mlir_libs._mlirDialectsAMDGPU import *
diff --git a/mlir/python/mlir/dialects/async_dialect/__init__.py b/mlir/python/mlir/dialects/async_dialect/__init__.py
index 6a5ecfc20956c..7f2e4e57211f5 100644
--- a/mlir/python/mlir/dialects/async_dialect/__init__.py
+++ b/mlir/python/mlir/dialects/async_dialect/__init__.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from .._async_ops_gen import *
+from .._async_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/bufferization.py b/mlir/python/mlir/dialects/bufferization.py
index 759b6aa24a9ff..a8139410deff6 100644
--- a/mlir/python/mlir/dialects/bufferization.py
+++ b/mlir/python/mlir/dialects/bufferization.py
@@ -3,4 +3,5 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._bufferization_ops_gen import *
+from ._bufferization_ops_gen import _Dialect
 from ._bufferization_enum_gen import *
diff --git a/mlir/python/mlir/dialects/cf.py b/mlir/python/mlir/dialects/cf.py
index c2e357a8e6565..0b9f4c7ffdb2a 100644
--- a/mlir/python/mlir/dialects/cf.py
+++ b/mlir/python/mlir/dialects/cf.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._cf_ops_gen import *
+from ._cf_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/complex.py b/mlir/python/mlir/dialects/complex.py
index ca81173cfc970..5861d86bc7fbc 100644
--- a/mlir/python/mlir/dialects/complex.py
+++ b/mlir/python/mlir/dialects/complex.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._complex_ops_gen import *
+from ._complex_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/emitc.py b/mlir/python/mlir/dialects/emitc.py
index 99c3286e576f1..94883e2ff7c47 100644
--- a/mlir/python/mlir/dialects/emitc.py
+++ b/mlir/python/mlir/dialects/emitc.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._emitc_ops_gen import *
+from ._emitc_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/index.py b/mlir/python/mlir/dialects/index.py
index 73708c7d71a8c..17fda117cd116 100644
--- a/mlir/python/mlir/dialects/index.py
+++ b/mlir/python/mlir/dialects/index.py
@@ -3,4 +3,5 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._index_ops_gen import *
+from ._index_ops_gen import _Dialect
 from ._index_enum_gen import *
diff --git a/mlir/python/mlir/dialects/linalg/__init__.py b/mlir/python/mlir/dialects/linalg/__init__.py
index c92bda74c12bf..0a97bc03f584b 100644
--- a/mlir/python/mlir/dialects/linalg/__init__.py
+++ b/mlir/python/mlir/dialects/linalg/__init__.py
@@ -9,6 +9,7 @@
 # definitions following these steps:
 #   DSL -> YAML -> tblgen -> pytblgen -> build/.../_linalg_ops_gen.py.
 from .._linalg_ops_gen import *
+from .._linalg_ops_gen import _Dialect
 from .._linalg_enum_gen import *
 from .._linalg_enum_gen import _iteratortypeenum
 
diff --git a/mlir/python/mlir/dialects/llvm.py b/mlir/python/mlir/dialects/llvm.py
index 941a584966dcd..1fd7e64251e61 100644
--- a/mlir/python/mlir/dialects/llvm.py
+++ b/mlir/python/mlir/dialects/llvm.py
@@ -3,6 +3,7 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._llvm_ops_gen import *
+from ._llvm_ops_gen import _Dialect
 from ._llvm_enum_gen import *
 from .._mlir_libs._mlirDialectsLLVM import *
 from ..ir import Value
diff --git a/mlir/python/mlir/dialects/math.py b/mlir/python/mlir/dialects/math.py
index f082bf4615859..5174764438f89 100644
--- a/mlir/python/mlir/dialects/math.py
+++ b/mlir/python/mlir/dialects/math.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._math_ops_gen import *
+from ._math_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/memref.py b/mlir/python/mlir/dialects/memref.py
index c80a1b1a89358..910a2356ca0e4 100644
--- a/mlir/python/mlir/dialects/memref.py
+++ b/mlir/python/mlir/dialects/memref.py
@@ -6,6 +6,7 @@
 from typing import Optional
 
 from ._memref_ops_gen import *
+from ._memref_ops_gen import _Dialect
 from ._ods_common import _dispatch_mixed_values, MixedValues
 from .arith import ConstantOp, _is_integer_like_type
 from ..ir import Value, MemRefType, StridedLayoutAttr, ShapedType, Operation
diff --git a/mlir/python/mlir/dialects/nvgpu.py b/mlir/python/mlir/dialects/nvgpu.py
index d6a54f2772f40..436a144bda79e 100644
--- a/mlir/python/mlir/dialects/nvgpu.py
+++ b/mlir/python/mlir/dialects/nvgpu.py
@@ -3,5 +3,6 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._nvgpu_ops_gen import *
+from ._nvgpu_ops_gen import _Dialect
 from ._nvgpu_enum_gen import *
 from .._mlir_libs._mlirDialectsNVGPU import *
diff --git a/mlir/python/mlir/dialects/nvvm.py b/mlir/python/mlir/dialects/nvvm.py
index 9477de39c9ead..80bc9838541ba 100644
--- a/mlir/python/mlir/dialects/nvvm.py
+++ b/mlir/python/mlir/dialects/nvvm.py
@@ -3,4 +3,5 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._nvvm_ops_gen import *
+from ._nvvm_ops_gen import _Dialect
 from ._nvvm_enum_gen import *
diff --git a/mlir/python/mlir/dialects/openacc.py b/mlir/python/mlir/dialects/openacc.py
index 057f71aed20a6..4071ca27dbe40 100644
--- a/mlir/python/mlir/dialects/openacc.py
+++ b/mlir/python/mlir/dialects/openacc.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._acc_ops_gen import *
+from ._acc_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/openmp.py b/mlir/python/mlir/dialects/openmp.py
index 604f0bd03e932..879bbc61d797f 100644
--- a/mlir/python/mlir/dialects/openmp.py
+++ b/mlir/python/mlir/dialects/openmp.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._omp_ops_gen import *
+from ._omp_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/python_test.py b/mlir/python/mlir/dialects/python_test.py
index 56d3c0f7a5465..f0bfcd8b68b35 100644
--- a/mlir/python/mlir/dialects/python_test.py
+++ b/mlir/python/mlir/dialects/python_test.py
@@ -3,6 +3,7 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._python_test_ops_gen import *
+from ._python_test_ops_gen import _Dialect
 
 
 def register_python_test_dialect(registry):
diff --git a/mlir/python/mlir/dialects/rocdl.py b/mlir/python/mlir/dialects/rocdl.py
index aa47cb4b55792..e4e604b5501d9 100644
--- a/mlir/python/mlir/dialects/rocdl.py
+++ b/mlir/python/mlir/dialects/rocdl.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._rocdl_ops_gen import *
+from ._rocdl_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/shape.py b/mlir/python/mlir/dialects/shape.py
index cc987ac843e75..c3966ff28d3a6 100644
--- a/mlir/python/mlir/dialects/shape.py
+++ b/mlir/python/mlir/dialects/shape.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._shape_ops_gen import *
+from ._shape_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/shard.py b/mlir/python/mlir/dialects/shard.py
index 8d69f17954290..b5e5b475c5d90 100644
--- a/mlir/python/mlir/dialects/shard.py
+++ b/mlir/python/mlir/dialects/shard.py
@@ -3,4 +3,5 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._shard_ops_gen import *
+from ._shard_ops_gen import _Dialect
 from ._shard_enum_gen import *
diff --git a/mlir/python/mlir/dialects/smt.py b/mlir/python/mlir/dialects/smt.py
index 38970d17abd47..c9227d7bcf443 100644
--- a/mlir/python/mlir/dialects/smt.py
+++ b/mlir/python/mlir/dialects/smt.py
@@ -3,6 +3,7 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._smt_ops_gen import *
+from ._smt_ops_gen import _Dialect
 from ._smt_enum_gen import *
 
 from .._mlir_libs._mlirDialectsSMT import *
diff --git a/mlir/python/mlir/dialects/sparse_tensor.py b/mlir/python/mlir/dialects/sparse_tensor.py
index 209ecc95fa8fc..8f8167715c458 100644
--- a/mlir/python/mlir/dialects/sparse_tensor.py
+++ b/mlir/python/mlir/dialects/sparse_tensor.py
@@ -3,6 +3,7 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._sparse_tensor_ops_gen import *
+from ._sparse_tensor_ops_gen import _Dialect
 from ._sparse_tensor_enum_gen import *
 from .._mlir_libs._mlirDialectsSparseTensor import *
 from .._mlir_libs import _mlirSparseTensorPasses as _cextSparseTensorPasses
diff --git a/mlir/python/mlir/dialects/spirv.py b/mlir/python/mlir/dialects/spirv.py
index 269678a2032eb..7cdfdb4c0092b 100644
--- a/mlir/python/mlir/dialects/spirv.py
+++ b/mlir/python/mlir/dialects/spirv.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._spirv_ops_gen import *
+from ._spirv_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/tosa.py b/mlir/python/mlir/dialects/tosa.py
index aebda742fde40..f2c1f99d222cb 100644
--- a/mlir/python/mlir/dialects/tosa.py
+++ b/mlir/python/mlir/dialects/tosa.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._tosa_ops_gen import *
+from ._tosa_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/ub.py b/mlir/python/mlir/dialects/ub.py
index 32e8706745302..101846f5e6bb7 100644
--- a/mlir/python/mlir/dialects/ub.py
+++ b/mlir/python/mlir/dialects/ub.py
@@ -3,3 +3,4 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._ub_ops_gen import *
+from ._ub_ops_gen import _Dialect
diff --git a/mlir/python/mlir/dialects/vector.py b/mlir/python/mlir/dialects/vector.py
index 7384e9a5aeef2..f01192e12199c 100644
--- a/mlir/python/mlir/dialects/vector.py
+++ b/mlir/python/mlir/dialects/vector.py
@@ -3,4 +3,5 @@
 #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 from ._vector_ops_gen import *
+from ._vector_ops_gen import _Dialect
 from ._vector_enum_gen import *

>From a7d8b88d1818d9eeb4e21de80f09719ca7ecb51a Mon Sep 17 00:00:00 2001
From: Yunbo Ni <87902024+cardigan1008 at users.noreply.github.com>
Date: Sun, 28 Dec 2025 18:33:23 +0800
Subject: [PATCH 25/34] [InstCombine] Add check for flag propagation in
 `foldSelectIntoOp` (#173735)

Fixes
https://github.com/llvm/llvm-project/pull/162003#issuecomment-3693943568.

The current flag propagation assumes that if a select has both `ninf`
and `nnan`, then the operands of the folded operation must be finite.
While this assumption holds for `fadd`, `fsub`, and `fmul`, it does not
hold for `fdiv`.

For example, assume we have:

```
A = 1.0, B = +Inf
A / B = 0.0  (finite, non-NaN)
```

The current transform would turn `fdiv A, B; select ninf nnan cond, A/B,
A;` into `A / (select ninf nnan cond, B, 1.0)`. If `cond` is true, the
inner select returns `B = +Inf`, and due to the propagated `ninf`, this
becomes poison.

This patch add check for operators before flag propagation to avoid
`fdiv` cases.

Alive2: https://alive2.llvm.org/ce/z/o0MJmS
---
 llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp |  9 ++++++++-
 .../select-binop-foldable-floating-point.ll           | 11 +++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index f52bac5e600cb..67d1845832725 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -555,8 +555,15 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
       // Examples: -inf + +inf = NaN, -inf - -inf = NaN, 0 * inf = NaN
       // Specifically, if the original select has both ninf and nnan, we can
       // safely propagate the flag.
+      // Note: This property holds for fadd, fsub, and fmul, but does not
+      // hold for fdiv (e.g. A / Inf == 0.0).
+      bool CanInferFiniteOperandsFromResult =
+          TVI->getOpcode() == Instruction::FAdd ||
+          TVI->getOpcode() == Instruction::FSub ||
+          TVI->getOpcode() == Instruction::FMul;
       NewSelFMF.setNoInfs(TVI->hasNoInfs() ||
-                          (NewSelFMF.noInfs() && NewSelFMF.noNaNs()));
+                          (CanInferFiniteOperandsFromResult &&
+                           NewSelFMF.noInfs() && NewSelFMF.noNaNs()));
       cast<Instruction>(NewSel)->setFastMathFlags(NewSelFMF);
     }
     NewSel->takeName(TVI);
diff --git a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
index c14dd469f1a6e..83fa28a406f75 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
@@ -409,3 +409,14 @@ define float @select_nnan_fdiv_invalid(i1 %cond, float %A, float %B) {
   %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
+
+define float @select_fpclass_fdiv_nnan_ninf(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_fpclass_fdiv_nnan_ninf(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fdiv float [[A:%.*]], [[C]]
+; CHECK-NEXT:    ret float [[D]]
+;
+  %C = fdiv float %A, %B
+  %D = select ninf nnan i1 %cond, float %C, float %A
+  ret float %D
+}

>From 1e985b6ddf023af5782d48c1cce881668fdf6ceb Mon Sep 17 00:00:00 2001
From: Ben Shi <2283975856 at qq.com>
Date: Sun, 28 Dec 2025 18:42:45 +0800
Subject: [PATCH 26/34] [AVR] Explicitly set flag 'hasSideEffects' of
 instructions (#173660)

---
 llvm/lib/Target/AVR/AVRInstrFormats.td |   3 +-
 llvm/lib/Target/AVR/AVRInstrInfo.td    | 246 +++++++++++++------------
 2 files changed, 129 insertions(+), 120 deletions(-)

diff --git a/llvm/lib/Target/AVR/AVRInstrFormats.td b/llvm/lib/Target/AVR/AVRInstrFormats.td
index 72ea3bc1f460d..c721d655fec7c 100644
--- a/llvm/lib/Target/AVR/AVRInstrFormats.td
+++ b/llvm/lib/Target/AVR/AVRInstrFormats.td
@@ -531,6 +531,7 @@ class FSK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
 class ExtensionPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
     : Pseudo<outs, ins, asmstr, pattern> {
   let Defs = [SREG];
+  let hasSideEffects = 0;
 }
 
 class StorePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
@@ -548,6 +549,6 @@ class SelectPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
 class ShiftPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
     : Pseudo<outs, ins, asmstr, pattern> {
   let usesCustomInserter = 1;
-
+  let hasSideEffects = 0;
   let Defs = [SREG];
 }
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index b5f13c055a860..1080d41c1e540 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -361,7 +361,8 @@ let Defs = [SP, SREG], Uses = [SP] in {
 //===----------------------------------------------------------------------===//
 // Addition
 //===----------------------------------------------------------------------===//
-let isCommutable = 1, Constraints = "$src = $rd", Defs = [SREG] in {
+let hasSideEffects = 0, isCommutable = 1, Constraints = "$src = $rd",
+    Defs = [SREG] in {
   // ADD Rd, Rr
   // Adds two 8-bit registers.
   def ADDRdRr : FRdRr<0b0000, 0b11, (outs GPR8:$rd),(ins GPR8:$src, GPR8:$rr),
@@ -408,7 +409,7 @@ let isCommutable = 1, Constraints = "$src = $rd", Defs = [SREG] in {
 //===----------------------------------------------------------------------===//
 // Subtraction
 //===----------------------------------------------------------------------===//
-let Constraints = "$rs = $rd", Defs = [SREG] in {
+let hasSideEffects = 0, Constraints = "$rs = $rd", Defs = [SREG] in {
   // SUB Rd, Rr
   // Subtracts the 8-bit value of Rr from Rd and places the value in Rd.
   def SUBRdRr : FRdRr<0b0001, 0b10, (outs GPR8:$rd), (ins GPR8:$rs, GPR8:$rr),
@@ -474,7 +475,7 @@ let Constraints = "$rs = $rd", Defs = [SREG] in {
 //===----------------------------------------------------------------------===//
 // Increment and Decrement
 //===----------------------------------------------------------------------===//
-let Constraints = "$src = $rd", Defs = [SREG] in {
+let hasSideEffects = 0, Constraints = "$src = $rd", Defs = [SREG] in {
   def INCRd : FRd<0b1001, 0b0100011, (outs GPR8:$rd), (ins GPR8:$src),
                   "inc\t$rd",
                   [(set i8:$rd, (add i8:$src, 1))]>;
@@ -488,7 +489,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
 // Multiplication
 //===----------------------------------------------------------------------===//
 
-let isCommutable = 1, Defs = [R1, R0, SREG] in {
+let hasSideEffects = 0, isCommutable = 1, Defs = [R1, R0, SREG] in {
   // MUL Rd, Rr
   // Multiplies Rd by Rr and places the result into R1:R0.
   let usesCustomInserter = 1 in {
@@ -518,14 +519,15 @@ let isCommutable = 1, Defs = [R1, R0, SREG] in {
                Requires<[SupportsMultiplication]>;
 }
 
-let Defs =
+// Set hasSideEffects = 1 to prevent it from being considered dead.
+let hasSideEffects = 1, Defs =
     [R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R1, R0] in
 def DESK : FDES<(outs), (ins i8imm:$k), "des\t$k", []>, Requires<[HasDES]>;
 
 //===----------------------------------------------------------------------===//
 // Logic
 //===----------------------------------------------------------------------===//
-let Constraints = "$src = $rd", Defs = [SREG] in {
+let hasSideEffects = 0, Constraints = "$src = $rd", Defs = [SREG] in {
   // Register-Register logic instructions (which have the
   // property of commutativity).
   let isCommutable = 1 in {
@@ -599,7 +601,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
 //===----------------------------------------------------------------------===//
 // One's/Two's Complement
 //===----------------------------------------------------------------------===//
-let Constraints = "$src = $rd", Defs = [SREG] in {
+let hasSideEffects = 0, Constraints = "$src = $rd", Defs = [SREG] in {
   def COMRd : FRd<0b1001, 0b0100000, (outs GPR8:$rd), (ins GPR8:$src),
                   "com\t$rd", [(set i8:$rd, (not i8:$src))]>;
 
@@ -620,7 +622,6 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
   // neg Rd+1
   // neg Rd
   // sbc Rd+1, r1
-  let hasSideEffects=0 in
   def NEGWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, GPR8:$zero),
                       "negw\t$rd", []>;
 }
@@ -701,7 +702,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
 //===----------------------------------------------------------------------===//
 // Compare operations.
 //===----------------------------------------------------------------------===//
-let Defs = [SREG] in {
+let hasSideEffects = 0, Defs = [SREG] in {
   // CPSE Rd, Rr
   // Compare Rd and Rr, skipping the next instruction if they are equal.
   let isBarrier = 1, isBranch = 1, isTerminator = 1 in
@@ -861,7 +862,7 @@ let hasSideEffects = 0 in {
 }
 
 // Load immediate values into registers.
-let isReMaterializable = 1 in {
+let hasSideEffects = 0, isReMaterializable = 1 in {
   def LDIRdK : FRdK<0b1110, (outs LD8:$rd), (ins imm_ldi8:$k), "ldi\t$rd, $k",
                     [(set i8:$rd, imm:$k)]>;
 
@@ -875,7 +876,7 @@ let isReMaterializable = 1 in {
 }
 
 // Load from data space into register.
-let mayLoad = 1, isReMaterializable = 1 in {
+let hasSideEffects = 0, mayLoad = 1, isReMaterializable = 1 in {
   def LDSRdK : F32DM<0b0, (outs GPR8:$rd), (ins imm16:$k), "lds\t$rd, $k",
                      [(set i8:$rd, (load imm:$k))]>,
                Requires<[HasSRAM, HasNonTinyEncoding]>;
@@ -896,113 +897,114 @@ let mayLoad = 1, isReMaterializable = 1 in {
 }
 
 // Indirect loads.
-let mayLoad = 1, isReMaterializable = 1 in {
-  def LDRdPtr : FSTLD<0, 0b00, (outs GPR8:$reg), (ins PTRREGS:$ptrreg),
-                      "ld\t$reg, $ptrreg",
-                      [(set GPR8:$reg, (load i16:$ptrreg))]>,
-                Requires<[HasSRAM]>;
-
-  // LDW Rd+1:Rd, P
-  //
-  // Expands to:
-  //   ld  Rd,   P
-  //   ldd Rd+1, P+1
-  // On reduced tiny cores, this instruction expands to:
-  //   ld    Rd,   P+
-  //   ld    Rd+1, P+
-  //   subiw P,    2
-  let Constraints = "@earlyclobber $reg" in
-  def LDWRdPtr : Pseudo<(outs DREGS:$reg), (ins PTRDISPREGS:$ptrreg),
-                        "ldw\t$reg, $ptrreg",
-                        [(set i16:$reg, (load i16:$ptrreg))]>,
-                 Requires<[HasSRAM]>;
-}
-
-// Indirect loads (with postincrement or predecrement).
-let mayLoad = 1, hasSideEffects = 0,
-    Constraints = "$ptrreg = $base_wb, at earlyclobber $reg" in {
-  def LDRdPtrPi : FSTLD<0, 0b01, (outs GPR8:$reg, PTRREGS:$base_wb),
-                        (ins PTRREGS:$ptrreg), "ld\t$reg, $ptrreg+", []>,
+let hasSideEffects = 0, mayLoad = 1 in {
+  let isReMaterializable = 1 in {
+    def LDRdPtr : FSTLD<0, 0b00, (outs GPR8:$reg), (ins PTRREGS:$ptrreg),
+                        "ld\t$reg, $ptrreg",
+                        [(set GPR8:$reg, (load i16:$ptrreg))]>,
                   Requires<[HasSRAM]>;
 
-  // LDW Rd+1:Rd, P+
-  // Expands to:
-  // ld Rd,   P+
-  // ld Rd+1, P+
-  def LDWRdPtrPi : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
-                          (ins PTRREGS:$ptrreg), "ldw\t$reg, $ptrreg+", []>,
+    // LDW Rd+1:Rd, P
+    //
+    // Expands to:
+    //   ld  Rd,   P
+    //   ldd Rd+1, P+1
+    // On reduced tiny cores, this instruction expands to:
+    //   ld    Rd,   P+
+    //   ld    Rd+1, P+
+    //   subiw P,    2
+    let Constraints = "@earlyclobber $reg" in
+    def LDWRdPtr : Pseudo<(outs DREGS:$reg), (ins PTRDISPREGS:$ptrreg),
+                          "ldw\t$reg, $ptrreg",
+                          [(set i16:$reg, (load i16:$ptrreg))]>,
                    Requires<[HasSRAM]>;
+  }
 
-  def LDRdPtrPd : FSTLD<0, 0b10, (outs GPR8:$reg, PTRREGS:$base_wb),
-                        (ins PTRREGS:$ptrreg), "ld\t$reg, -$ptrreg", []>,
-                  Requires<[HasSRAM]>;
+  // Indirect loads (with postincrement or predecrement).
+  let isReMaterializable = 0,
+      Constraints = "$ptrreg = $base_wb, at earlyclobber $reg" in {
+    def LDRdPtrPi : FSTLD<0, 0b01, (outs GPR8:$reg, PTRREGS:$base_wb),
+                          (ins PTRREGS:$ptrreg), "ld\t$reg, $ptrreg+", []>,
+                    Requires<[HasSRAM]>;
 
-  // LDW Rd+1:Rd, -P
-  //
-  // Expands to:
-  // ld Rd+1, -P
-  // ld Rd,   -P
-  def LDWRdPtrPd : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
-                          (ins PTRREGS:$ptrreg), "ldw\t$reg, -$ptrreg", []>,
-                   Requires<[HasSRAM]>;
-}
+    // LDW Rd+1:Rd, P+
+    // Expands to:
+    // ld Rd,   P+
+    // ld Rd+1, P+
+    def LDWRdPtrPi : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+                            (ins PTRREGS:$ptrreg), "ldw\t$reg, $ptrreg+", []>,
+                     Requires<[HasSRAM]>;
 
-// Load indirect with displacement operations.
-let mayLoad = 1, isReMaterializable = 1 in {
-  def LDDRdPtrQ : FSTDLDD<0, (outs GPR8:$reg), (ins memri:$memri),
-                          "ldd\t$reg, $memri",
-                          [(set i8:$reg, (load addr:$memri))]>,
-                  Requires<[HasSRAM, HasNonTinyEncoding]>;
+    def LDRdPtrPd : FSTLD<0, 0b10, (outs GPR8:$reg, PTRREGS:$base_wb),
+                          (ins PTRREGS:$ptrreg), "ld\t$reg, -$ptrreg", []>,
+                    Requires<[HasSRAM]>;
 
-  // LDDW Rd+1:Rd, P+q
-  //
-  // Expands to:
-  //   ldd Rd,   P+q
-  //   ldd Rd+1, P+q+1
-  // On reduced tiny cores, this instruction expands to:
-  //   subiw P,    -q
-  //   ld    Rd,   P+
-  //   ld    Rd+1, P+
-  //   subiw P,    q+2
-  let Constraints = "@earlyclobber $dst" in
-  def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst), (ins memri:$memri),
-                          "lddw\t$dst, $memri",
-                          [(set i16:$dst, (load addr:$memri))]>,
-                   Requires<[HasSRAM]>;
+    // LDW Rd+1:Rd, -P
+    //
+    // Expands to:
+    // ld Rd+1, -P
+    // ld Rd,   -P
+    def LDWRdPtrPd : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+                            (ins PTRREGS:$ptrreg), "ldw\t$reg, -$ptrreg", []>,
+                     Requires<[HasSRAM]>;
+  }
 
-  // An identical pseudo instruction to LDDWRdPtrQ, expect restricted to the Y
-  // register and without the @earlyclobber flag.
-  //
-  // Used to work around a bug caused by the register allocator not
-  // being able to handle the expansion of a COPY into an machine instruction
-  // that has an earlyclobber flag. This is because the register allocator will
-  // try expand a copy from a register slot into an earlyclobber instruction.
-  // Instructions that are earlyclobber need to be in a dedicated earlyclobber
-  // slot.
-  //
-  // This pseudo instruction can be used pre-AVR pseudo expansion in order to
-  // get a frame index load without directly using earlyclobber instructions.
-  //
-  // The pseudo expansion pass trivially expands this into LDDWRdPtrQ.
-  //
-  // This instruction may be removed once PR13375 is fixed.
-  let hasSideEffects = 0 in
-  def LDDWRdYQ : Pseudo<(outs DREGS:$dst), (ins memri:$memri),
-                        "lddw\t$dst, $memri", []>,
-                 Requires<[HasSRAM]>;
+  // Load indirect with displacement operations.
+  let isReMaterializable = 1 in {
+    def LDDRdPtrQ : FSTDLDD<0, (outs GPR8:$reg), (ins memri:$memri),
+                            "ldd\t$reg, $memri",
+                            [(set i8:$reg, (load addr:$memri))]>,
+                    Requires<[HasSRAM, HasNonTinyEncoding]>;
+
+    // LDDW Rd+1:Rd, P+q
+    //
+    // Expands to:
+    //   ldd Rd,   P+q
+    //   ldd Rd+1, P+q+1
+    // On reduced tiny cores, this instruction expands to:
+    //   subiw P,    -q
+    //   ld    Rd,   P+
+    //   ld    Rd+1, P+
+    //   subiw P,    q+2
+    let Constraints = "@earlyclobber $dst" in
+    def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst), (ins memri:$memri),
+                            "lddw\t$dst, $memri",
+                            [(set i16:$dst, (load addr:$memri))]>,
+                     Requires<[HasSRAM]>;
+
+    // An identical pseudo instruction to LDDWRdPtrQ, except restricted to the Y
+    // register and without the @earlyclobber flag.
+    //
+    // Used to work around a bug caused by the register allocator not
+    // being able to handle the expansion of a COPY into an machine instruction
+    // that has an earlyclobber flag. This is because the register allocator will
+    // try to expand a copy from a register slot into an earlyclobber instruction.
+    // Instructions that are earlyclobber need to be in a dedicated earlyclobber
+    // slot.
+    //
+    // This pseudo instruction can be used pre-AVR pseudo expansion in order to
+    // get a frame index load without directly using earlyclobber instructions.
+    //
+    // The pseudo expansion pass trivially expands this into LDDWRdPtrQ.
+    //
+    // This instruction may be removed once PR13375 is fixed.
+    def LDDWRdYQ : Pseudo<(outs DREGS:$dst), (ins memri:$memri),
+                          "lddw\t$dst, $memri", []>,
+                   Requires<[HasSRAM]>;
+  }
 }
 
-let mayLoad = 1, isReMaterializable = 1 in
+let hasSideEffects = 0, mayLoad = 1, isReMaterializable = 1 in
 class AtomicLoad<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC>
     : Pseudo<(outs DRC:$rd), (ins PTRRC:$rr), "atomic_op",
              [(set DRC:$rd, (Op i16:$rr))]>;
 
-let mayStore = 1 in
+let hasSideEffects = 0, mayStore = 1 in
 class AtomicStore<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC>
     : Pseudo<(outs), (ins PTRRC:$rd, DRC:$rr), "atomic_op",
              [(Op DRC:$rr, i16:$rd)]>;
 
-let mayLoad = 1, mayStore = 1 in
+let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in
 class AtomicLoadOp<PatFrag Op, RegisterClass DRC, RegisterClass PTRRC>
     : Pseudo<(outs DRC:$rd), (ins PTRRC:$rr, DRC:$operand), "atomic_op",
              [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>;
@@ -1025,7 +1027,7 @@ def AtomicStore16 : AtomicStore<atomic_store_16, DREGS, PTRDISPREGS>;
 class AtomicLoadOp8<PatFrag Op> : AtomicLoadOp<Op, GPR8, PTRREGS>;
 class AtomicLoadOp16<PatFrag Op> : AtomicLoadOp<Op, DREGS, PTRDISPREGS>;
 
-let usesCustomInserter=1 in {
+let usesCustomInserter = 1, Defs = [SREG] in {
   def AtomicLoadAdd8 : AtomicLoadOp8<atomic_load_add_i8>;
   def AtomicLoadAdd16 : AtomicLoadOp16<atomic_load_add_i16>;
   def AtomicLoadSub8 : AtomicLoadOp8<atomic_load_sub_i8>;
@@ -1038,11 +1040,12 @@ let usesCustomInserter=1 in {
   def AtomicLoadXor16 : AtomicLoadOp16<atomic_load_xor_i16>;
 }
 
+let hasSideEffects = 1 in
 def AtomicFence
     : Pseudo<(outs), (ins), "atomic_fence", [(atomic_fence timm, timm)]>;
 
 // Indirect store from register to data space.
-let mayStore = 1 in {
+let hasSideEffects = 0, mayStore = 1 in {
   def STSKRr : F32DM<0b1, (outs), (ins imm16:$k, GPR8:$rd), "sts\t$k, $rd",
                      [(store i8:$rd, imm:$k)]>,
                Requires<[HasSRAM, HasNonTinyEncoding]>;
@@ -1058,7 +1061,7 @@ let mayStore = 1 in {
 // Expands to:
 // sts Rr+1, (K+1:K) + 1
 // sts Rr,   (K+1:K)
-let mayStore = 1 in
+let hasSideEffects = 0, mayStore = 1 in
 def STSWKRr : Pseudo<(outs), (ins i16imm:$dst, DREGS:$src),
                      "stsw\t$dst, $src", [(store i16:$src, imm:$dst)]>,
               Requires<[HasSRAM, HasNonTinyEncoding]>;
@@ -1066,7 +1069,7 @@ def STSWKRr : Pseudo<(outs), (ins i16imm:$dst, DREGS:$src),
 // Indirect stores.
 // ST P, Rr
 // Stores the value of Rr into the location addressed by pointer P.
-let mayStore = 1 in
+let hasSideEffects = 0, mayStore = 1 in
 def STPtrRr : FSTLD<1, 0b00, (outs), (ins PTRREGS:$ptrreg, GPR8:$reg),
                     "st\t$ptrreg, $reg", [(store GPR8:$reg, i16:$ptrreg)]>,
               Requires<[HasSRAM]>;
@@ -1081,14 +1084,14 @@ def STPtrRr : FSTLD<1, 0b00, (outs), (ins PTRREGS:$ptrreg, GPR8:$reg),
 //   st    P+, Rr
 //   st    P+, Rr+1
 //   subiw P,  q+2
-let mayStore = 1 in
+let hasSideEffects = 0, mayStore = 1 in
 def STWPtrRr : Pseudo<(outs), (ins PTRDISPREGS:$ptrreg, DREGS:$reg),
                       "stw\t$ptrreg, $reg", [(store i16:$reg, i16:$ptrreg)]>,
                Requires<[HasSRAM]>;
 
 // Indirect stores (with postincrement or predecrement).
-let mayStore = 1, Constraints = "$ptrreg = $base_wb, at earlyclobber $base_wb" in {
-
+let hasSideEffects = 0, mayStore = 1,
+    Constraints = "$ptrreg = $base_wb, at earlyclobber $base_wb" in {
   // ST P+, Rr
   // Stores the value of Rr into the location addressed by pointer P.
   // Post increments P.
@@ -1143,7 +1146,7 @@ let mayStore = 1, Constraints = "$ptrreg = $base_wb, at earlyclobber $base_wb" in {
 // STD P+q, Rr
 // Stores the value of Rr into the location addressed by pointer P with a
 // displacement of q. Does not modify P.
-let mayStore = 1 in
+let hasSideEffects = 0, mayStore = 1 in
 def STDPtrQRr : FSTDLDD<1, (outs), (ins memri:$memri, GPR8:$reg),
                         "std\t$memri, $reg", [(store i8:$reg, addr:$memri)]>,
                 Requires<[HasSRAM, HasNonTinyEncoding]>;
@@ -1160,7 +1163,7 @@ def STDPtrQRr : FSTDLDD<1, (outs), (ins memri:$memri, GPR8:$reg),
 //   st    P+, Rr
 //   st    P+, Rr+1
 //   subiw P,  q+2
-let mayStore = 1 in
+let hasSideEffects = 0, mayStore = 1 in
 def STDWPtrQRr : Pseudo<(outs), (ins memri:$memri, DREGS:$src),
                         "stdw\t$memri, $src", [(store i16:$src, addr:$memri)]>,
                  Requires<[HasSRAM]>;
@@ -1226,7 +1229,7 @@ let mayLoad = 1, hasSideEffects = 0 in {
   }
 
   // These pseudos are combination of the OUT and ELPM instructions.
-  let Defs = [R31R30], hasSideEffects = 1 in {
+  let Defs = [R31R30], mayStore = 1 in {
     def ELPMBRdZPi : Pseudo<(outs GPR8:$dst), (ins ZREG:$z, LD8:$p),
                             "elpmb\t$dst, $z+, $p", []>,
                      Requires<[HasELPMX]>;
@@ -1238,12 +1241,12 @@ let mayLoad = 1, hasSideEffects = 0 in {
 }
 
 // Store program memory operations.
-let Uses = [R1, R0], mayStore = 1 in {
+let Uses = [R1, R0], mayStore = 1, hasSideEffects = 0 in {
   let Uses = [R31R30] in
   def SPM : F16<0b1001010111101000, (outs), (ins), "spm", []>,
             Requires<[HasSPM]>;
 
-  let Defs = [R31R30] in 
+  let Defs = [R31R30] in
   def SPMZPi : F16<0b1001010111111000, (outs), (ins ZREG:$z), "spm $z+", []>,
                Requires<[HasSPMX]> {
     bits<0> z;
@@ -1251,7 +1254,7 @@ let Uses = [R1, R0], mayStore = 1 in {
 }
 
 // Read data from IO location operations.
-let mayLoad = 1 in {
+let hasSideEffects = 1, mayLoad = 1 in {
   def INRdA : FIORdA<(outs GPR8:$rd), (ins imm_port6:$A), "in\t$rd, $A",
                      [(set i8:$rd, (load ioaddr8:$A))]>;
 
@@ -1260,7 +1263,7 @@ let mayLoad = 1 in {
 }
 
 // Write data to IO location operations.
-let mayStore = 1 in {
+let hasSideEffects = 1, mayStore = 1 in {
   def OUTARr : FIOARr<(outs), (ins imm_port6:$A, GPR8:$rr), "out\t$A, $rr",
                       [(store i8:$rr, ioaddr8:$A)]>;
 
@@ -1291,7 +1294,7 @@ let Defs = [SP], Uses = [SP], hasSideEffects = 0 in {
 }
 
 // Read-Write-Modify (RMW) instructions.
-let mayLoad = 1, mayStore = 1 in {
+let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in {
   def XCHZRd : FZRd<0b100, (outs GPR8:$rd), (ins ZREG:$z), "xch\t$z, $rd", []>,
                Requires<[SupportsRMW]>;
 
@@ -1310,7 +1313,7 @@ let mayLoad = 1, mayStore = 1 in {
 //===----------------------------------------------------------------------===//
 
 // Bit shift/rotate operations.
-let Constraints = "$src = $rd", Defs = [SREG] in {
+let hasSideEffects = 0, Constraints = "$src = $rd", Defs = [SREG] in {
   // 8-bit LSL is an alias of ADD Rd, Rd
 
   def LSLWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lslw\t$rd",
@@ -1389,14 +1392,14 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
 
 // SWAP Rd
 // Swaps the high and low nibbles in a register.
-let Constraints = "$src = $rd" in
+let hasSideEffects = 0, Constraints = "$src = $rd" in
 def SWAPRd : FRd<0b1001, 0b0100010, (outs GPR8:$rd), (ins GPR8:$src),
                  "swap\t$rd", [(set i8:$rd, (AVRSwap i8:$src))]>;
 
 // IO register bit set/clear operations.
 //: TODO: add patterns when popcount(imm)==2 to be expanded with 2 sbi/cbi
 // instead of in+ori+out which requires one more instr.
-let mayStore = 1 in {
+let hasSideEffects = 1, mayStore = 1 in {
   def SBIAb : FIOBIT<0b10, (outs), (ins imm_port5:$addr, i8imm:$b),
                      "sbi\t$addr, $b",
                      [(store(or(i8(load lowioaddr8:$addr)), iobitpos8:$b),
@@ -1483,6 +1486,7 @@ def : InstAlias<"cli", (BCLRs 7)>;
 // Breakpoint instruction
 // ---------
 // <|1001|0101|1001|1000>
+let hasSideEffects = 1 in
 def BREAK : F16<0b1001010110011000, (outs), (ins), "break", []>,
             Requires<[HasBREAK]>;
 
@@ -1490,18 +1494,22 @@ def BREAK : F16<0b1001010110011000, (outs), (ins), "break", []>,
 // No-operation instruction
 // ---------
 // <|0000|0000|0000|0000>
+// Set `hasSideEffects = 1` so the NOP is not removed as dead code.
+let hasSideEffects = 1 in
 def NOP : F16<0b0000000000000000, (outs), (ins), "nop", []>;
 
 // SLEEP
 // Sleep instruction
 // ---------
 // <|1001|0101|1000|1000>
+let hasSideEffects = 1 in
 def SLEEP : F16<0b1001010110001000, (outs), (ins), "sleep", []>;
 
 // WDR
 // Watchdog reset
 // ---------
 // <|1001|0101|1010|1000>
+let hasSideEffects = 1 in
 def WDR : F16<0b1001010110101000, (outs), (ins), "wdr", []>;
 
 //===----------------------------------------------------------------------===//
@@ -1625,7 +1633,7 @@ def Asr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
                           (AVRasrw i16:$srclo, i16:$srchi, i8:$cnt))]>;
 
 // lowered to a copy from the zero register.
-let usesCustomInserter=1 in
+let usesCustomInserter = 1, hasSideEffects = 0 in
 def CopyZero : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>;
 
 //===----------------------------------------------------------------------===//

>From 56ddbb7560d82bf2cf1bcb79f1dac72eee0e8f52 Mon Sep 17 00:00:00 2001
From: Tobias Gysi <tobias.gysi at nextsilicon.com>
Date: Sun, 28 Dec 2025 16:46:49 +0100
Subject: [PATCH 27/34] [mlir][llvm] Use the roundtrip flag in test (#173782)

This commit changes the debug info test to use the `--verify-roundtrip`
flag. The byte code roundtrip has been problematic before. Recent changes
to the attribute parsing make it possible to use the flag.

This closes https://github.com/llvm/llvm-project/issues/163004
---
 mlir/test/Dialect/LLVMIR/debuginfo.mlir | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/test/Dialect/LLVMIR/debuginfo.mlir b/mlir/test/Dialect/LLVMIR/debuginfo.mlir
index d7bf99bfaed7f..545715f73f449 100644
--- a/mlir/test/Dialect/LLVMIR/debuginfo.mlir
+++ b/mlir/test/Dialect/LLVMIR/debuginfo.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s | mlir-opt | FileCheck %s
-// RUN: mlir-opt -emit-bytecode %s | mlir-opt | FileCheck %s
+// RUN: mlir-opt %s --verify-roundtrip | mlir-opt | FileCheck %s
 
 // CHECK-DAG: #[[FILE:.*]] = #llvm.di_file<"debuginfo.mlir" in "/test/">
 #file = #llvm.di_file<"debuginfo.mlir" in "/test/">

>From 2c0565fcff7ae0d6c1bbca85c588420a155ddc5a Mon Sep 17 00:00:00 2001
From: sskzakaria <ssskzakaria at proton.me>
Date: Sun, 28 Dec 2025 10:47:03 -0500
Subject: [PATCH 28/34]   [X86][Clang] VectorExprEvaluator::VisitCallExpr /
 InterpretBuiltin - allow AVX512 mask predicate intrinsics to be used in
 constexpr (#173739)

Enables constexpr evaluation for the following AVX512 Integer Comparison Intrinsics:
```
_mm512_kmov

_mm_movm_epi8 _mm256_movm_epi8 _mm512_movm_epi8
_mm_movm_epi16 _mm256_movm_epi16 _mm512_movm_epi16
_mm_movm_epi32 _mm256_movm_epi32 _mm512_movm_epi32
_mm_movm_epi64 _mm256_movm_epi64 _mm512_movm_epi64
```
FIXES #162072
---
 clang/include/clang/Basic/BuiltinsX86.td     | 24 ++++++++-----
 clang/lib/AST/ByteCode/InterpBuiltin.cpp     | 38 ++++++++++++++++++++
 clang/lib/AST/ExprConstant.cpp               | 35 ++++++++++++++++++
 clang/lib/Headers/avx512bwintrin.h           | 10 +++---
 clang/lib/Headers/avx512dqintrin.h           | 10 +++---
 clang/lib/Headers/avx512fintrin.h            |  5 ++-
 clang/lib/Headers/avx512vlbwintrin.h         | 20 +++++------
 clang/lib/Headers/avx512vldqintrin.h         | 20 +++++------
 clang/test/CodeGen/X86/avx512bw-builtins.c   | 26 ++++++++------
 clang/test/CodeGen/X86/avx512dq-builtins.c   |  5 +++
 clang/test/CodeGen/X86/avx512f-builtins.c    |  2 ++
 clang/test/CodeGen/X86/avx512vlbw-builtins.c | 13 +++++++
 clang/test/CodeGen/X86/avx512vldq-builtins.c | 10 ++++++
 13 files changed, 161 insertions(+), 57 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index c8d476e47808b..b4cc4c257edc1 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -2430,19 +2430,23 @@ let Features = "avx512bw,avx512vl",
   def cvtb2mask256 : X86Builtin<"unsigned int(_Vector<32, char>)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512bw,avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def cvtmask2b128 : X86Builtin<"_Vector<16, char>(unsigned short)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512bw,avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def cvtmask2b256 : X86Builtin<"_Vector<32, char>(unsigned int)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512bw,avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def cvtmask2w128 : X86Builtin<"_Vector<8, short>(unsigned char)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512bw,avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def cvtmask2w256 : X86Builtin<"_Vector<16, short>(unsigned short)">;
 }
 
@@ -2456,19 +2460,23 @@ let Features = "avx512dq,avx512vl",
   def cvtd2mask256 : X86Builtin<"unsigned char(_Vector<8, int>)">;
 }
 
-let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512dq,avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def cvtmask2d128 : X86Builtin<"_Vector<4, int>(unsigned char)">;
 }
 
-let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512dq,avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def cvtmask2d256 : X86Builtin<"_Vector<8, int>(unsigned char)">;
 }
 
-let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512dq,avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def cvtmask2q128 : X86Builtin<"_Vector<2, long long int>(unsigned char)">;
 }
 
-let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512dq,avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def cvtmask2q256 : X86Builtin<"_Vector<4, long long int>(unsigned char)">;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index c8f986a55ed3e..57d5f0ae6eed3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3421,6 +3421,30 @@ static bool interp__builtin_ia32_cvt_vec2mask(InterpState &S, CodePtr OpPC,
   pushInteger(S, RetMask, Call->getType());
   return true;
 }
+
+static bool interp__builtin_ia32_cvt_mask2vec(InterpState &S, CodePtr OpPC,
+                                              const CallExpr *Call,
+                                              unsigned ID) {
+  assert(Call->getNumArgs() == 1);
+
+  APSInt Mask = popToAPSInt(S, Call->getArg(0));
+
+  const Pointer &Vec = S.Stk.peek<Pointer>();
+  unsigned NumElems = Vec.getNumElems();
+  PrimType ElemT = Vec.getFieldDesc()->getPrimType();
+
+  for (unsigned I = 0; I != NumElems; ++I) {
+    bool BitSet = Mask[I];
+
+    INT_TYPE_SWITCH_NO_BOOL(
+        ElemT, { Vec.elem<T>(I) = BitSet ? T::from(-1) : T::from(0); });
+  }
+
+  Vec.initializeAllElements();
+
+  return true;
+}
+
 static bool interp__builtin_ia32_cvtsd2ss(InterpState &S, CodePtr OpPC,
                                           const CallExpr *Call,
                                           bool HasRoundingMask) {
@@ -5533,6 +5557,20 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case X86::BI__builtin_ia32_cvtq2mask512:
     return interp__builtin_ia32_cvt_vec2mask(S, OpPC, Call, BuiltinID);
 
+  case X86::BI__builtin_ia32_cvtmask2b128:
+  case X86::BI__builtin_ia32_cvtmask2b256:
+  case X86::BI__builtin_ia32_cvtmask2b512:
+  case X86::BI__builtin_ia32_cvtmask2w128:
+  case X86::BI__builtin_ia32_cvtmask2w256:
+  case X86::BI__builtin_ia32_cvtmask2w512:
+  case X86::BI__builtin_ia32_cvtmask2d128:
+  case X86::BI__builtin_ia32_cvtmask2d256:
+  case X86::BI__builtin_ia32_cvtmask2d512:
+  case X86::BI__builtin_ia32_cvtmask2q128:
+  case X86::BI__builtin_ia32_cvtmask2q256:
+  case X86::BI__builtin_ia32_cvtmask2q512:
+    return interp__builtin_ia32_cvt_mask2vec(S, OpPC, Call, BuiltinID);
+
   case X86::BI__builtin_ia32_cvtsd2ss:
     return interp__builtin_ia32_cvtsd2ss(S, OpPC, Call, false);
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index f80dabf5444c7..8618979d1eba0 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12388,6 +12388,41 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), RetLen), E);
   }
 
+  case clang::X86::BI__builtin_ia32_cvtmask2b128:
+  case clang::X86::BI__builtin_ia32_cvtmask2b256:
+  case clang::X86::BI__builtin_ia32_cvtmask2b512:
+  case clang::X86::BI__builtin_ia32_cvtmask2w128:
+  case clang::X86::BI__builtin_ia32_cvtmask2w256:
+  case clang::X86::BI__builtin_ia32_cvtmask2w512:
+  case clang::X86::BI__builtin_ia32_cvtmask2d128:
+  case clang::X86::BI__builtin_ia32_cvtmask2d256:
+  case clang::X86::BI__builtin_ia32_cvtmask2d512:
+  case clang::X86::BI__builtin_ia32_cvtmask2q128:
+  case clang::X86::BI__builtin_ia32_cvtmask2q256:
+  case clang::X86::BI__builtin_ia32_cvtmask2q512: {
+    assert(E->getNumArgs() == 1);
+    APSInt Mask;
+    if (!EvaluateInteger(E->getArg(0), Mask, Info))
+      return false;
+
+    QualType VecTy = E->getType();
+    const VectorType *VT = VecTy->castAs<VectorType>();
+    unsigned VectorLen = VT->getNumElements();
+    QualType ElemTy = VT->getElementType();
+    unsigned ElemWidth = Info.Ctx.getTypeSize(ElemTy);
+
+    SmallVector<APValue, 16> Elems;
+    for (unsigned I = 0; I != VectorLen; ++I) {
+      bool BitSet = Mask[I];
+      APSInt ElemVal(ElemWidth, /*isUnsigned=*/false);
+      if (BitSet) {
+        ElemVal.setAllBits();
+      }
+      Elems.push_back(APValue(ElemVal));
+    }
+    return Success(APValue(Elems.data(), VectorLen), E);
+  }
+
   case X86::BI__builtin_ia32_extracti32x4_256_mask:
   case X86::BI__builtin_ia32_extractf32x4_256_mask:
   case X86::BI__builtin_ia32_extracti32x4_mask:
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 48b7c98df7b68..cd4663abe7d9e 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -1760,15 +1760,13 @@ _mm512_movepi16_mask(__m512i __A) {
   return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi8 (__mmask64 __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_movm_epi8(__mmask64 __A) {
   return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi16 (__mmask32 __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_movm_epi16(__mmask32 __A) {
   return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
 }
 
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index ae02cdd47af2e..084ac891821c0 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -1051,15 +1051,13 @@ static __inline__ __mmask16
   return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi32 (__mmask16 __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_movm_epi32(__mmask16 __A) {
   return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi64 (__mmask8 __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_movm_epi64(__mmask8 __A) {
   return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
 }
 
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 9bcb42033f6ef..e03e8689d3f8a 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -5355,9 +5355,8 @@ _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(R)))
 
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kmov (__mmask16 __A)
-{
+static __inline__ __mmask16
+    __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_kmov(__mmask16 __A) {
   return  __A;
 }
 
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index a7c1e1c4fc3d2..b66d3961dffc9 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -2500,27 +2500,23 @@ _mm256_movepi16_mask(__m256i __A) {
   return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi8 (__mmask16 __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_movm_epi8(__mmask16 __A) {
   return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi8 (__mmask32 __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_movm_epi8(__mmask32 __A) {
   return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi16 (__mmask8 __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_movm_epi16(__mmask8 __A) {
   return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi16 (__mmask16 __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_movm_epi16(__mmask16 __A) {
   return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
 }
 
diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h
index c956aeb7d03a4..cd1effdec2d62 100644
--- a/clang/lib/Headers/avx512vldqintrin.h
+++ b/clang/lib/Headers/avx512vldqintrin.h
@@ -924,27 +924,23 @@ _mm256_movepi32_mask(__m256i __A) {
   return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi32 (__mmask8 __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_movm_epi32(__mmask8 __A) {
   return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi32 (__mmask8 __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_movm_epi32(__mmask8 __A) {
   return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi64 (__mmask8 __A)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_movm_epi64(__mmask8 __A) {
   return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi64 (__mmask8 __A)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_movm_epi64(__mmask8 __A) {
   return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
 }
 
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 7cdec9b4cbbee..96b809cffdd9f 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -2952,16 +2952,15 @@ __m512i test_mm512_movm_epi8(__mmask64 __A) {
   return _mm512_movm_epi8(__A); 
 }
 
-TEST_CONSTEXPR(_mm512_movepi8_mask(
-    ((__m512i)(__v64qi){0, 1, char(129), 3, 4, 5, 6, 7,
-                        8, 9, 10, 11, 12, 13, 14, 15,
-                        16, 17, 18, 19, 20, 21, 22, 23,
-                        24, 25, 26, 27, 28, 29, 30, 31,
-                        32, 33, 34, 35, 36, 37, 38, 39,
-                        40, 41, 42, 43, 44, 45, 46, 47,
-                        48, 49, 50, 51, 52, 53, 54, 55,
-                        56, 57, 58, 59, 60, 61, 62, char(255)})
-) == (__mmask64)0x8000000000000004);
+TEST_CONSTEXPR(match_v64qi(_mm512_movm_epi8(0x8000000000000005),
+    -1, 0, -1, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, -1));
 
 
 __m512i test_mm512_movm_epi16(__mmask32 __A) {
@@ -2971,6 +2970,13 @@ __m512i test_mm512_movm_epi16(__mmask32 __A) {
   return _mm512_movm_epi16(__A); 
 }
 
+TEST_CONSTEXPR(match_v32hi(_mm512_movm_epi16(0x80000007),
+    -1, -1, -1, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, -1));
+
+
 __m512i test_mm512_broadcastb_epi8(__m128i __A) {
   // CHECK-LABEL: test_mm512_broadcastb_epi8
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <64 x i32> zeroinitializer
diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
index d8647b5547ceb..edbe591b02cbb 100644
--- a/clang/test/CodeGen/X86/avx512dq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
@@ -1381,6 +1381,8 @@ __m512i test_mm512_movm_epi32(__mmask16 __A) {
   return _mm512_movm_epi32(__A); 
 }
 
+TEST_CONSTEXPR(match_v16si(_mm512_movm_epi32(0x8005), -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1));
+
 __m512i test_mm512_movm_epi64(__mmask8 __A) {
   // CHECK-LABEL: test_mm512_movm_epi64
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
@@ -1388,6 +1390,9 @@ __m512i test_mm512_movm_epi64(__mmask8 __A) {
   return _mm512_movm_epi64(__A); 
 }
 
+TEST_CONSTEXPR(match_v8di(_mm512_movm_epi64(0x85), -1, 0, -1, 0, 0, 0, 0, -1));
+
+
 __mmask8 test_mm512_movepi64_mask(__m512i __A) {
   // CHECK-LABEL: test_mm512_movepi64_mask
   // CHECK: [[CMP:%.*]] = icmp slt <8 x i64> %{{.*}}, zeroinitializer
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 1402ee411029a..f78b28d6da1b1 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -4988,6 +4988,8 @@ __mmask16 test_mm512_kmov(__mmask16 __A) {
   return _mm512_kmov(__A); 
 }
 
+TEST_CONSTEXPR(_mm512_kmov((__mmask16)0x8005) == (__mmask16)0x8005);
+
 __m512d test_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: test_mm512_mask_unpackhi_pd
   // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index f6f27d9c3da3d..9f70f4639b4ab 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -3219,6 +3219,8 @@ __m128i test_mm_movm_epi8(__mmask16 __A) {
   return _mm_movm_epi8(__A); 
 }
 
+TEST_CONSTEXPR(match_v16qi(_mm_movm_epi8(0x8005),-1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1));
+
 __m256i test_mm256_movm_epi8(__mmask32 __A) {
   // CHECK-LABEL: test_mm256_movm_epi8
   // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1>
@@ -3226,6 +3228,12 @@ __m256i test_mm256_movm_epi8(__mmask32 __A) {
   return _mm256_movm_epi8(__A); 
 }
 
+TEST_CONSTEXPR(match_v32qi(_mm256_movm_epi8(0x80000007),
+    -1, -1, -1, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, -1));
+
 __m128i test_mm_movm_epi16(__mmask8 __A) {
   // CHECK-LABEL: test_mm_movm_epi16
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
@@ -3233,6 +3241,8 @@ __m128i test_mm_movm_epi16(__mmask8 __A) {
   return _mm_movm_epi16(__A); 
 }
 
+TEST_CONSTEXPR(match_v8hi(_mm_movm_epi16(0x85), -1, 0, -1, 0, 0, 0, 0, -1));
+
 __m256i test_mm256_movm_epi16(__mmask16 __A) {
   // CHECK-LABEL: test_mm256_movm_epi16
   // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
@@ -3240,6 +3250,9 @@ __m256i test_mm256_movm_epi16(__mmask16 __A) {
   return _mm256_movm_epi16(__A); 
 }
 
+TEST_CONSTEXPR(match_v16hi(_mm256_movm_epi16(0x8005), -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1));
+
+
 __m128i test_mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_broadcastb_epi8
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c
index 92d8e1aa0879a..652fe149db927 100644
--- a/clang/test/CodeGen/X86/avx512vldq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c
@@ -944,6 +944,8 @@ __m128i test_mm_movm_epi32(__mmask8 __A) {
   return _mm_movm_epi32(__A); 
 }
 
+TEST_CONSTEXPR(match_v4si(_mm_movm_epi32(0x05), -1, 0, -1, 0));
+
 __m256i test_mm256_movm_epi32(__mmask8 __A) {
   // CHECK-LABEL: test_mm256_movm_epi32
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
@@ -951,6 +953,8 @@ __m256i test_mm256_movm_epi32(__mmask8 __A) {
   return _mm256_movm_epi32(__A); 
 }
 
+TEST_CONSTEXPR(match_v8si(_mm256_movm_epi32(0x85), -1, 0, -1, 0, 0, 0, 0, -1));
+
 __m128i test_mm_movm_epi64(__mmask8 __A) {
   // CHECK-LABEL: test_mm_movm_epi64
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
@@ -959,6 +963,9 @@ __m128i test_mm_movm_epi64(__mmask8 __A) {
   return _mm_movm_epi64(__A); 
 }
 
+TEST_CONSTEXPR(match_v2di(_mm_movm_epi64(0x03), -1, -1));
+
+
 __m256i test_mm256_movm_epi64(__mmask8 __A) {
   // CHECK-LABEL: test_mm256_movm_epi64
   // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
@@ -967,6 +974,9 @@ __m256i test_mm256_movm_epi64(__mmask8 __A) {
   return _mm256_movm_epi64(__A); 
 }
 
+TEST_CONSTEXPR(match_v4di(_mm256_movm_epi64(0x05), -1, 0, -1, 0));
+
+
 __mmask8 test_mm_movepi64_mask(__m128i __A) {
   // CHECK-LABEL: test_mm_movepi64_mask
   // CHECK: [[CMP:%.*]] = icmp slt <2 x i64> %{{.*}}, zeroinitializer

>From 05a34dde7008f41395312c36c95c4ebc65ff6254 Mon Sep 17 00:00:00 2001
From: Mahesh-Attarde <mahesh.attarde at intel.com>
Date: Sun, 28 Dec 2025 21:29:44 +0530
Subject: [PATCH 29/34] [X86][GISEL] Enable Pre Legalizer Combiner (#173220)

This patch enables Pre-legalization Combiner for X86 Target. It includes
basic bring up with intent to cover non-regressing support from
all_combines.

continuing from https://github.com/llvm/llvm-project/pull/172204.
---
 llvm/lib/Target/X86/CMakeLists.txt            |   3 +
 .../X86/GISel/X86PreLegalizerCombiner.cpp     | 177 ++++++++++++++++++
 llvm/lib/Target/X86/X86.h                     |   2 +
 llvm/lib/Target/X86/X86.td                    |   5 +
 llvm/lib/Target/X86/X86Combine.td             |  20 ++
 llvm/lib/Target/X86/X86TargetMachine.cpp      |   8 +
 .../X86/GlobalISel/stacksave-stackrestore.ll  |   7 +-
 .../switch-bit-test-unreachable-default.ll    |   3 +-
 8 files changed, 219 insertions(+), 6 deletions(-)
 create mode 100644 llvm/lib/Target/X86/GISel/X86PreLegalizerCombiner.cpp
 create mode 100644 llvm/lib/Target/X86/X86Combine.td

diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 434a6d2c3553f..f2880d6c6ea5e 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -19,6 +19,8 @@ tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
 tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
 tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables -asmwriternum=1)
+tablegen(LLVM X86GenPreLegalizeGICombiner.inc -gen-global-isel-combiner
+              -combiners="X86PreLegalizerCombiner")
 
 add_public_tablegen_target(X86CommonTableGen)
 
@@ -87,6 +89,7 @@ set(sources
   GISel/X86CallLowering.cpp
   GISel/X86InstructionSelector.cpp
   GISel/X86LegalizerInfo.cpp
+  GISel/X86PreLegalizerCombiner.cpp
   GISel/X86RegisterBankInfo.cpp
   )
 
diff --git a/llvm/lib/Target/X86/GISel/X86PreLegalizerCombiner.cpp b/llvm/lib/Target/X86/GISel/X86PreLegalizerCombiner.cpp
new file mode 100644
index 0000000000000..01c3473d16292
--- /dev/null
+++ b/llvm/lib/Target/X86/GISel/X86PreLegalizerCombiner.cpp
@@ -0,0 +1,177 @@
+//===---------------- X86PreLegalizerCombiner.cpp -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass does combining of machine instructions at the generic MI level,
+/// before the legalizer.
+///
+//===----------------------------------------------------------------------===//
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
+#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Instructions.h"
+
+#define GET_GICOMBINER_DEPS
+#include "X86GenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_DEPS
+
+#define DEBUG_TYPE "x86-prelegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+namespace {
+
+#define GET_GICOMBINER_TYPES
+#include "X86GenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_TYPES
+
+class X86PreLegalizerCombinerImpl : public Combiner {
+protected:
+  const CombinerHelper Helper;
+  const X86PreLegalizerCombinerImplRuleConfig &RuleConfig;
+  const X86Subtarget &STI;
+
+public:
+  X86PreLegalizerCombinerImpl(
+      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+      GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
+      const X86PreLegalizerCombinerImplRuleConfig &RuleConfig,
+      const X86Subtarget &STI, MachineDominatorTree *MDT,
+      const LegalizerInfo *LI);
+
+  static const char *getName() { return "X86PreLegalizerCombiner"; }
+
+  bool tryCombineAll(MachineInstr &I) const override;
+
+  bool tryCombineAllImpl(MachineInstr &I) const;
+
+private:
+#define GET_GICOMBINER_CLASS_MEMBERS
+#include "X86GenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_CLASS_MEMBERS
+};
+
+#define GET_GICOMBINER_IMPL
+#include "X86GenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_IMPL
+
+X86PreLegalizerCombinerImpl::X86PreLegalizerCombinerImpl(
+    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+    GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
+    const X86PreLegalizerCombinerImplRuleConfig &RuleConfig,
+    const X86Subtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
+    : Combiner(MF, CInfo, TPC, &VT, CSEInfo),
+      Helper(Observer, B, /*IsPreLegalize=*/true, &VT, MDT, LI),
+      RuleConfig(RuleConfig), STI(STI),
+#define GET_GICOMBINER_CONSTRUCTOR_INITS
+#include "X86GenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_CONSTRUCTOR_INITS
+{
+}
+
+bool X86PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
+  return tryCombineAllImpl(MI);
+}
+
+class X86PreLegalizerCombiner : public MachineFunctionPass {
+public:
+  static char ID;
+
+  X86PreLegalizerCombiner();
+
+  StringRef getPassName() const override { return "X86PreLegalizerCombiner"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  X86PreLegalizerCombinerImplRuleConfig RuleConfig;
+};
+} // end anonymous namespace
+
+void X86PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetPassConfig>();
+  AU.setPreservesCFG();
+  getSelectionDAGFallbackAnalysisUsage(AU);
+  AU.addRequired<GISelValueTrackingAnalysisLegacy>();
+  AU.addPreserved<GISelValueTrackingAnalysisLegacy>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
+  AU.addRequired<GISelCSEAnalysisWrapperPass>();
+  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+X86PreLegalizerCombiner::X86PreLegalizerCombiner() : MachineFunctionPass(ID) {
+  if (!RuleConfig.parseCommandLineOption())
+    report_fatal_error("Invalid rule identifier");
+}
+
+bool X86PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.getProperties().hasFailedISel())
+    return false;
+  auto &TPC = getAnalysis<TargetPassConfig>();
+
+  // Enable CSE.
+  GISelCSEAnalysisWrapper &Wrapper =
+      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());
+
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  const LegalizerInfo *LI = ST.getLegalizerInfo();
+
+  const Function &F = MF.getFunction();
+  bool EnableOpt =
+      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
+  GISelValueTracking *VT =
+      &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF);
+  MachineDominatorTree *MDT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  CombinerInfo CInfo(/*AllowIllegalOps=*/true, /*ShouldLegalizeIllegal=*/false,
+                     /*LegalizerInfo=*/LI, EnableOpt, F.hasOptSize(),
+                     F.hasMinSize());
+
+  // This is the first Combiner, so the input IR might contain dead
+  // instructions.
+  CInfo.EnableFullDCE = true;
+  X86PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *VT, CSEInfo, RuleConfig,
+                                   ST, MDT, LI);
+  return Impl.combineMachineInstrs();
+}
+
+char X86PreLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(X86PreLegalizerCombiner, DEBUG_TYPE,
+                      "Combine X86 machine instrs before legalization", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy)
+INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
+INITIALIZE_PASS_END(X86PreLegalizerCombiner, DEBUG_TYPE,
+                    "Combine X86 machine instrs before legalization", false,
+                    false)
+
+namespace llvm {
+FunctionPass *createX86PreLegalizerCombiner() {
+  return new X86PreLegalizerCombiner();
+}
+} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index b949453031dfe..86677ca48860a 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -226,6 +226,7 @@ InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   const X86Subtarget &,
                                                   const X86RegisterBankInfo &);
 
+FunctionPass *createX86PreLegalizerCombiner();
 FunctionPass *createX86LoadValueInjectionLoadHardeningPass();
 FunctionPass *createX86LoadValueInjectionRetHardeningPass();
 FunctionPass *createX86SpeculativeLoadHardeningPass();
@@ -269,6 +270,7 @@ void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
 void initializeX86TileConfigPass(PassRegistry &);
 void initializeX86SuppressAPXForRelocationPassPass(PassRegistry &);
 void initializeX86WinEHUnwindV2Pass(PassRegistry &);
+void initializeX86PreLegalizerCombinerPass(PassRegistry &);
 
 namespace X86AS {
 enum : unsigned {
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 4a3dc17263402..3e357c9000bea 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -2090,3 +2090,8 @@ def X86 : Target {
 //===----------------------------------------------------------------------===//
 
 include "X86PfmCounters.td"
+
+//===----------------------------------------------------------------------===//
+// Global Isel Combiner
+//===----------------------------------------------------------------------===//
+include "X86Combine.td"
diff --git a/llvm/lib/Target/X86/X86Combine.td b/llvm/lib/Target/X86/X86Combine.td
new file mode 100644
index 0000000000000..1c099644a6a0f
--- /dev/null
+++ b/llvm/lib/Target/X86/X86Combine.td
@@ -0,0 +1,20 @@
+//===----------------------- X86Combine.td -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/GlobalISel/Combine.td"
+
+// all_x86combines is based on the generic all_combines. Currently, X86 GISel
+// does not have vector support and has a few open issues that caused failures
+// with some combines. We will introduce more combines gradually.
+
+def all_x86combines : GICombineGroup<[identity_combines, reassocs, 
+    simplify_add_to_sub]>;
+
+def X86PreLegalizerCombiner : GICombiner<"X86PreLegalizerCombinerImpl", [all_x86combines]> {
+    let CombineAllMethodName = "tryCombineAllImpl";
+}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 66341e5ab960e..de0d78814fd83 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -107,6 +107,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() {
   initializeX86DynAllocaExpanderLegacyPass(PR);
   initializeX86SuppressAPXForRelocationPassPass(PR);
   initializeX86WinEHUnwindV2Pass(PR);
+  initializeX86PreLegalizerCombinerPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -373,6 +374,7 @@ class X86PassConfig : public TargetPassConfig {
   bool addLegalizeMachineIR() override;
   bool addRegBankSelect() override;
   bool addGlobalInstructionSelect() override;
+  void addPreLegalizeMachineIR() override;
   bool addILPOpts() override;
   bool addPreISel() override;
   void addMachineSSAOptimization() override;
@@ -487,6 +489,12 @@ bool X86PassConfig::addGlobalInstructionSelect() {
   return false;
 }
 
+void X86PassConfig::addPreLegalizeMachineIR() {
+  if (getOptLevel() != CodeGenOptLevel::None) {
+    addPass(createX86PreLegalizerCombiner());
+  }
+}
+
 bool X86PassConfig::addILPOpts() {
   addPass(&EarlyIfConverterLegacyID);
   if (EnableMachineCombinerPass)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
index e86c04ee22dbd..f55706edf1301 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
@@ -18,10 +18,9 @@ define void @test_scoped_alloca(i64 %n) {
 ; CHECK-NEXT:    .cfi_offset %rbx, -24
 ; CHECK-NEXT:    movq %rsp, %rbx
 ; CHECK-NEXT:    movq %rsp, %rax
-; CHECK-NEXT:    imulq $1, %rdi, %rcx
-; CHECK-NEXT:    addq $15, %rcx
-; CHECK-NEXT:    andq $-16, %rcx
-; CHECK-NEXT:    subq %rcx, %rax
+; CHECK-NEXT:    addq $15, %rdi
+; CHECK-NEXT:    andq $-16, %rdi
+; CHECK-NEXT:    subq %rdi, %rax
 ; CHECK-NEXT:    movq %rax, %rsp
 ; CHECK-NEXT:    movq %rax, %rdi
 ; CHECK-NEXT:    callq use_addr
diff --git a/llvm/test/CodeGen/X86/switch-bit-test-unreachable-default.ll b/llvm/test/CodeGen/X86/switch-bit-test-unreachable-default.ll
index 1a93e38af9f9b..43ca1b1d0bc48 100644
--- a/llvm/test/CodeGen/X86/switch-bit-test-unreachable-default.ll
+++ b/llvm/test/CodeGen/X86/switch-bit-test-unreachable-default.ll
@@ -40,12 +40,11 @@ define i32 @baz(i32 %0) {
 ; CHECK-GISEL:   %0:gr32 = COPY $edi
 ; CHECK-GISEL:   %10:gr32 = MOV32ri 1
 ; CHECK-GISEL:   %11:gr32 = MOV32r0 implicit-def dead $eflags
-; CHECK-GISEL:   %2:gr32 = SUB32ri %0:gr32(tied-def 0), 0, implicit-def dead $eflags
 ; CHECK-GISEL: bb.5 (%ir-block.1):
 ; CHECK-GISEL: ; predecessors: %bb.1
 ; CHECK-GISEL:   successors: %bb.4(0x55555555), %bb.2(0x2aaaaaab); %bb.4(66.67%), %bb.2(33.33%)
 ; CHECK-GISEL:   %3:gr32 = MOV32ri 1
-; CHECK-GISEL:   %13:gr8 = COPY %2.sub_8bit:gr32
+; CHECK-GISEL:   %13:gr8 = COPY %0.sub_8bit:gr32
 ; CHECK-GISEL:   $cl = COPY %13:gr8
 ; CHECK-GISEL:   %4:gr32 = SHL32rCL %3:gr32(tied-def 0), implicit-def dead $eflags, implicit $cl
 ; CHECK-GISEL:   %6:gr32 = AND32ri %4:gr32(tied-def 0), 13056, implicit-def dead $eflags

>From e2dfba68a0c3e1bf2edfef4eab32fbeb46d36cf1 Mon Sep 17 00:00:00 2001
From: Dhruva Narayan K <dhruvakodiadka at gmail.com>
Date: Sun, 28 Dec 2025 22:24:06 +0530
Subject: [PATCH 30/34] [InstCombine] Simplify demanded bits of blendv mask
 operands (#173723)

fixes #173368
- Integer masks: Demands only the sign bit of the operand.
- Float/Double masks: Peeks through bitcasts to demand the sign bit
  from the integer source.
---
 .../Target/X86/X86InstCombineIntrinsic.cpp    | 19 +++++
 .../Transforms/InstCombine/X86/blend_x86.ll   | 69 +++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index ffb8f2ef3643b..cbfaf0f60333f 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2890,7 +2890,26 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
           getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
     }
+    unsigned BitWidth = Mask->getType()->getScalarSizeInBits();
 
+    if (Mask->getType()->isIntOrIntVectorTy()) {
+      KnownBits Known(BitWidth);
+      if (IC.SimplifyDemandedBits(&II, 2, APInt::getSignMask(BitWidth), Known))
+        return &II;
+    } else if (auto *BC = dyn_cast<BitCastInst>(Mask)) {
+      if (BC->hasOneUse()) {
+        Value *Src = BC->getOperand(0);
+        if (Src->getType()->isIntOrIntVectorTy()) {
+          unsigned SrcBitWidth = Src->getType()->getScalarSizeInBits();
+          if (SrcBitWidth == BitWidth) {
+            KnownBits KnownSrc(SrcBitWidth);
+            if (IC.SimplifyDemandedBits(BC, 0, APInt::getSignMask(SrcBitWidth),
+                                        KnownSrc))
+              return &II;
+          }
+        }
+      }
+    }
     Mask = InstCombiner::peekThroughBitcast(Mask);
 
     // Bitshift upto the signbit can always be converted to an efficient
diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
index 0916cf7e708ae..90fa512d306a2 100644
--- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
+++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
@@ -357,6 +357,75 @@ define <4 x double> @shl_blendvpd_v4f64(<4 x double> %a0, <4 x double> %a1, <4 x
   ret <4 x double> %r
 }
 
+define <16 x i8> @pblendvb_demanded_msb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %m) {
+; CHECK-LABEL: @pblendvb_demanded_msb(
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[OR:%.*]])
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %or = or <16 x i8> %m, splat (i8 1)
+  %r = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %or)
+  ret <16 x i8> %r
+}
+
+define <8 x float> @blendvps_demanded_msb(<8 x float> %a, <8 x float> %b, <8 x i32> %m) {
+; CHECK-LABEL: @blendvps_demanded_msb(
+; CHECK-NEXT:    [[MASK:%.*]] = bitcast <8 x i32> [[OR:%.*]] to <8 x float>
+; CHECK-NEXT:    [[R:%.*]] = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[MASK]])
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
+  %or = or <8 x i32> %m, splat (i32 1)
+  %mask = bitcast <8 x i32> %or to <8 x float>
+  %r = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %mask)
+  ret <8 x float> %r
+}
+
+define <16 x i8> @pblendvb_or_affects_msb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %m) {
+; CHECK-LABEL: @pblendvb_or_affects_msb(
+; CHECK-NEXT:    ret <16 x i8> [[R:%.*]]
+;
+  %or = or <16 x i8> %m, splat (i8 128)
+  %r = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %or)
+  ret <16 x i8> %r
+}
+
+define <32 x i8> @pblendvb_demanded_msb_avx2(<32 x i8> %a, <32 x i8> %b, <32 x i8> %m) {
+; CHECK-LABEL: @pblendvb_demanded_msb_avx2(
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]], <32 x i8> [[OR:%.*]])
+; CHECK-NEXT:    ret <32 x i8> [[R]]
+;
+  %or = or <32 x i8> %m, splat (i8 1)
+  %r  = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a, <32 x i8> %b, <32 x i8> %or)
+  ret <32 x i8> %r
+}
+
+define <2 x double> @blendvpd_demanded_msb(<2 x double> %a, <2 x double> %b, <2 x i64> %m) {
+; CHECK-LABEL: @blendvpd_demanded_msb(
+; CHECK-NEXT:    [[MASK:%.*]] = bitcast <2 x i64> [[M:%.*]] to <2 x double>
+; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[MASK]])
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
+  %or = or <2 x i64> %m, splat (i64 1)
+  %mask = bitcast <2 x i64> %or to <2 x double>
+  %r = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a, <2 x double> %b, <2 x double> %mask)
+  ret <2 x double> %r
+}
+
+declare void @use_mask(<8 x float>)
+define <8 x float> @blendvps_demanded_msb_multiuse(<8 x float> %a, <8 x float> %b, <8 x i32> %m) {
+; CHECK-LABEL: @blendvps_demanded_msb_multiuse(
+; CHECK-NEXT:    [[OR:%.*]] = or <8 x i32> [[M:%.*]], splat (i32 1)
+; CHECK-NEXT:    [[MASK:%.*]] = bitcast <8 x i32> [[OR]] to <8 x float>
+; CHECK-NEXT:    call void @use_mask(<8 x float> [[MASK]])
+; CHECK-NEXT:    [[R:%.*]] = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[MASK]])
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
+  %or = or <8 x i32> %m, splat (i32 1)
+  %mask = bitcast <8 x i32> %or to <8 x float>
+  call void @use_mask(<8 x float> %mask)
+  %r = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %mask)
+  ret <8 x float> %r
+}
+
 declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
 declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)

>From e227125e6dbeda993c6fafc6e1719fdbed68cce1 Mon Sep 17 00:00:00 2001
From: Eduardo Tachotte <bfwaend at gmail.com>
Date: Sun, 28 Dec 2025 14:09:59 -0300
Subject: [PATCH 31/34] [libc++][NFC] Fix typo in comment (#173741)

Found a typo while reading the `vector` implementation; this PR simply
fixes it.
---
 libcxx/include/__vector/vector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h
index 93358d863492e..2cb3325807a42 100644
--- a/libcxx/include/__vector/vector.h
+++ b/libcxx/include/__vector/vector.h
@@ -114,7 +114,7 @@ class vector {
   using reverse_iterator       = std::reverse_iterator<iterator>;
   using const_reverse_iterator = std::reverse_iterator<const_iterator>;
 
-  // A vector containers the following members which may be trivially relocatable:
+  // A vector contains the following members which may be trivially relocatable:
   // - pointer: may be trivially relocatable, so it's checked
   // - allocator_type: may be trivially relocatable, so it's checked
   // vector doesn't contain any self-references, so it's trivially relocatable if its members are.

>From c449f7f1f83b98cdf0eb5c7f829d915138baab44 Mon Sep 17 00:00:00 2001
From: Owen Anderson <resistor at mac.com>
Date: Sun, 28 Dec 2025 11:19:37 -0600
Subject: [PATCH 32/34] [ELF/RISCV] Add definitions for XCHERIOT1 non-standard
 relocations on RISCV. (#172414)

The behavior of these relocations is specified in the CHERIoT
Architecture specification, Version 1.0:
https://github.com/CHERIoT-Platform/cheriot-sail/releases/download/v1.0/cheriot-architecture-v1.0.pdf
---
 .../llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def    | 6 ++++++
 llvm/unittests/Object/ELFTest.cpp                        | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
index 037ca64387339..fb0e29b2aaae5 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
@@ -30,3 +30,9 @@ ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_CALL_PLT, 195)
 // Andes Nonstandard Relocations
 // Calculation: S + A - P (10-bit PC-relative branch offset)
 ELF_RISCV_NONSTANDARD_RELOC(ANDES, R_RISCV_NDS_BRANCH_10,    241)
+
+// CHERIoT Nonstandard Relocations
+ELF_RISCV_NONSTANDARD_RELOC(CHERIOT1, R_RISCV_CHERIOT1_COMPARTMENT_HI, 220)
+ELF_RISCV_NONSTANDARD_RELOC(CHERIOT1, R_RISCV_CHERIOT1_COMPARTMENT_LO_I, 221)
+ELF_RISCV_NONSTANDARD_RELOC(CHERIOT1, R_RISCV_CHERIOT1_COMPARTMENT_LO_S, 222)
+ELF_RISCV_NONSTANDARD_RELOC(CHERIOT1, R_RISCV_CHERIOT1_COMPARTMENT_SIZE, 223)
diff --git a/llvm/unittests/Object/ELFTest.cpp b/llvm/unittests/Object/ELFTest.cpp
index 61ec78fea953a..b0faf073e4ab5 100644
--- a/llvm/unittests/Object/ELFTest.cpp
+++ b/llvm/unittests/Object/ELFTest.cpp
@@ -267,6 +267,15 @@ TEST(ELFTest, getRISCVVendorRelocationTypeName) {
 
   EXPECT_EQ("R_RISCV_NDS_BRANCH_10",
             getRISCVVendorRelocationTypeName(R_RISCV_CUSTOM241, "ANDES"));
+
+  EXPECT_EQ("R_RISCV_CHERIOT1_COMPARTMENT_HI",
+            getRISCVVendorRelocationTypeName(R_RISCV_CUSTOM220, "CHERIOT1"));
+  EXPECT_EQ("R_RISCV_CHERIOT1_COMPARTMENT_LO_I",
+            getRISCVVendorRelocationTypeName(R_RISCV_CUSTOM221, "CHERIOT1"));
+  EXPECT_EQ("R_RISCV_CHERIOT1_COMPARTMENT_LO_S",
+            getRISCVVendorRelocationTypeName(R_RISCV_CUSTOM222, "CHERIOT1"));
+  EXPECT_EQ("R_RISCV_CHERIOT1_COMPARTMENT_SIZE",
+            getRISCVVendorRelocationTypeName(R_RISCV_CUSTOM223, "CHERIOT1"));
 }
 
 TEST(ELFTest, getELFRelativeRelocationType) {

>From 7a3bbf724dc65ca959714bfd66b5e7ebf8c4e091 Mon Sep 17 00:00:00 2001
From: MetalOxideSemi <43286339+MetalOxideSemi at users.noreply.github.com>
Date: Mon, 29 Dec 2025 02:00:46 +0800
Subject: [PATCH 33/34] [SelectionDAG] Fix null pointer dereference in
 resolveDanglingDebugInfo (#173500)

## Summary
Fix null pointer dereference in
`SelectionDAGBuilder::resolveDanglingDebugInfo`.

## Problem
`Val.getNode()->getIROrder()` is called before checking if
`Val.getNode()` is null, causing crashes when compiling code with debug
info that contains aggregate constants with nested empty structs.

## Solution
Move the `ValSDNodeOrder` declaration inside the `if (Val.getNode())`
block.

## Test Case
Reproduces with aggregate types containing nested empty structs:
```llvm
%3 = insertvalue { { i1, {} }, ptr, { { {} }, { {} } }, i64 }
     { { i1, {} } zeroinitializer, ptr null, { { {} }, { {} } } zeroinitializer, i64 2 },
     ptr %2, 1, !dbg !893

## Crash stack
0.      Program arguments: llc-20 -O3 -mcpu=native -relocation-model=pic -filetype=obj /cloudide/workspace/temp/sf.ll -o /dev/null
1.      Running pass 'Function Pass Manager' on module '/cloudide/workspace/temp/sf.ll'.
2.      Running pass 'X86 DAG->DAG Instruction Selection' on function '@filter_create'
Stack dump without symbol names (ensure you have llvm-symbolizer in your PATH or set the environment var `LLVM_SYMBOLIZER_PATH` to point to it):
0  libLLVM.so.20.1 0x00007ff87ebbdf86 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) + 54
1  libLLVM.so.20.1 0x00007ff87ebbbb90 llvm::sys::RunSignalHandlers() + 80
2  libLLVM.so.20.1 0x00007ff87ebbe640
3  libpthread.so.0 0x00007ff87db79140
4  libLLVM.so.20.1 0x00007ff87f3fd2ff llvm::SelectionDAGBuilder::resolveDanglingDebugInfo(llvm::Value const*, llvm::SDValue) + 303
5  libLLVM.so.20.1 0x00007ff87f3fda5e llvm::SelectionDAGBuilder::getValue(llvm::Value const*) + 142
6  libLLVM.so.20.1 0x00007ff87f3fe79f llvm::SelectionDAGBuilder::getValueImpl(llvm::Value const*) + 3343
7  libLLVM.so.20.1 0x00007ff87f3fda34 llvm::SelectionDAGBuilder::getValue(llvm::Value const*) + 100
8  libLLVM.so.20.1 0x00007ff87f3fc1ab llvm::SelectionDAGBuilder::visitInsertValue(llvm::InsertValueInst const&) + 603
9  libLLVM.so.20.1 0x00007ff87f3eeaf7 llvm::SelectionDAGBuilder::visit(llvm::Instruction const&) + 327
10 libLLVM.so.20.1 0x00007ff87f4904b8 llvm::SelectionDAGISel::SelectBasicBlock(llvm::ilist_iterator_w_bits<llvm::ilist_detail::node_options<llvm::Instruction, false, false, void, true, llvm::BasicBlock>, false, true>, llvm::ilist_iterator_w_bits<llvm::ilist_detail::node_options<llvm::Instruction, false, false, void, true, llvm::BasicBlock>, false, true>, bool&) + 72
11 libLLVM.so.20.1 0x00007ff87f490304 llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) + 5956
12 libLLVM.so.20.1 0x00007ff87f48e2b4 llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) + 372
13 libLLVM.so.20.1 0x00007ff87f48c689 llvm::SelectionDAGISelLegacy::runOnMachineFunction(llvm::MachineFunction&) + 169
14 libLLVM.so.20.1 0x00007ff87efb8e32 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) + 610
15 libLLVM.so.20.1 0x00007ff87ed104be llvm::FPPassManager::runOnFunction(llvm::Function&) + 638
16 libLLVM.so.20.1 0x00007ff87ed15ff3 llvm::FPPassManager::runOnModule(llvm::Module&) + 51
17 libLLVM.so.20.1 0x00007ff87ed10c11 llvm::legacy::PassManagerImpl::run(llvm::Module&) + 1105
18 llc-20          0x000055972ce77dc1 main + 9649
19 libc.so.6       0x00007ff87d68ad7a __libc_start_main + 234
20 llc-20          0x000055972ce7247a _start + 42
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  2 +-
 .../selectiondag-dbgvalue-null-crash.ll       | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/Generic/selectiondag-dbgvalue-null-crash.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 9e342f9c4416f..e35c0d95f3941 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1504,7 +1504,6 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
   DanglingDebugInfoVector &DDIV = DanglingDbgInfoIt->second;
   for (auto &DDI : DDIV) {
     DebugLoc DL = DDI.getDebugLoc();
-    unsigned ValSDNodeOrder = Val.getNode()->getIROrder();
     unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
     DILocalVariable *Variable = DDI.getVariable();
     DIExpression *Expr = DDI.getExpression();
@@ -1518,6 +1517,7 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
       // in the first place we should not be more successful here). Unless we
       // have some test case that prove this to be correct we should avoid
       // calling EmitFuncArgumentDbgValue here.
+      unsigned ValSDNodeOrder = Val.getNode()->getIROrder();
       if (!EmitFuncArgumentDbgValue(V, Variable, Expr, DL,
                                     FuncArgumentDbgValueKind::Value, Val)) {
         LLVM_DEBUG(dbgs() << "Resolve dangling debug info for "
diff --git a/llvm/test/CodeGen/Generic/selectiondag-dbgvalue-null-crash.ll b/llvm/test/CodeGen/Generic/selectiondag-dbgvalue-null-crash.ll
new file mode 100644
index 0000000000000..3ae8eed1392a6
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/selectiondag-dbgvalue-null-crash.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O3 < %s
+;
+; Regression test for a null pointer dereference in
+; SelectionDAGBuilder::resolveDanglingDebugInfo when Val.getNode() returns null
+; for aggregate types with nested empty structs.
+;
+; The crash occurred when:
+; 1. A dbg_value references an aggregate type containing empty structs {}
+; 2. An insertvalue operation on such types gets lowered by SelectionDAG
+; 3. The resulting SDValue has a null node, causing a crash when accessed
+
+define void @test() !dbg !4 {
+entry:
+  %tmp = alloca { { i1, {} }, ptr, { { {} }, { {} } }, i64 }, align 8
+    #dbg_value({ { {} }, { {} } } zeroinitializer, !5, !DIExpression(), !6)
+    #dbg_value(i64 2, !7, !DIExpression(), !6)
+  %0 = insertvalue { { i1, {} }, ptr, { { {} }, { {} } }, i64 } { { i1, {} } zeroinitializer, ptr null, { { {} }, { {} } } zeroinitializer, i64 2 }, ptr null, 1, !dbg !6
+  %1 = insertvalue { { i1, {} }, ptr, { { {} }, { {} } }, i64 } %0, { i1, {} } zeroinitializer, 0, !dbg !8
+  store { { i1, {} }, ptr, { { {} }, { {} } }, i64 } %1, ptr %tmp, align 8
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "test_selectiondag.cpp", directory: "/home/AnonTokyo/documents/llvm-project/temp")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0)
+!5 = !DILocalVariable(name: "v1", scope: !4, file: !1, line: 2)
+!6 = !DILocation(line: 2, column: 1, scope: !4)
+!7 = !DILocalVariable(name: "v2", scope: !4, file: !1, line: 3)
+!8 = !DILocation(line: 3, column: 1, scope: !4)

>From cd480a2dd0f3552d3e5b0400554fb6d6efbaf554 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me at m-sp.org>
Date: Thu, 25 Dec 2025 12:55:26 +0000
Subject: [PATCH 34/34] [mlir][SCF] Fold unused `index_switch` results

---
 mlir/lib/Dialect/SCF/IR/SCF.cpp         | 52 ++++++++++++++++++++++++-
 mlir/test/Dialect/SCF/canonicalize.mlir | 31 +++++++++++++++
 2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 4a6b8aa7b1125..46d09abd89d69 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -4797,9 +4797,59 @@ struct FoldConstantCase : OpRewritePattern<scf::IndexSwitchOp> {
   }
 };
 
+/// Canonicalization pattern that folds away dead results of
+/// "scf.index_switch" ops.
+struct FoldUnusedIndexSwitchResults : OpRewritePattern<IndexSwitchOp> {
+  using OpRewritePattern<IndexSwitchOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(IndexSwitchOp op,
+                                PatternRewriter &rewriter) const override {
+    // Find dead results.
+    BitVector deadResults(op.getNumResults(), false);
+    SmallVector<Type> newResultTypes;
+    for (auto [idx, result] : llvm::enumerate(op.getResults())) {
+      if (!result.use_empty()) {
+        newResultTypes.push_back(result.getType());
+      } else {
+        deadResults[idx] = true;
+      }
+    }
+    if (!deadResults.any())
+      return rewriter.notifyMatchFailure(op, "no dead results to fold");
+
+    // Create new op without dead results and inline case regions.
+    auto newOp = IndexSwitchOp::create(rewriter, op.getLoc(), newResultTypes,
+                                       op.getArg(), op.getCases(),
+                                       op.getCaseRegions().size());
+    auto inlineCaseRegion = [&](Region &oldRegion, Region &newRegion) {
+      rewriter.inlineRegionBefore(oldRegion, newRegion, newRegion.begin());
+      // Remove respective operands from yield op.
+      Operation *terminator = newRegion.front().getTerminator();
+      assert(isa<YieldOp>(terminator) && "expected yield op");
+      rewriter.modifyOpInPlace(
+          terminator, [&]() { terminator->eraseOperands(deadResults); });
+    };
+    for (auto [oldRegion, newRegion] :
+         llvm::zip_equal(op.getCaseRegions(), newOp.getCaseRegions()))
+      inlineCaseRegion(oldRegion, newRegion);
+    inlineCaseRegion(op.getDefaultRegion(), newOp.getDefaultRegion());
+
+    // Replace op with new op.
+    SmallVector<Value> newResults(op.getNumResults(), Value());
+    unsigned nextNewResult = 0;
+    for (unsigned idx = 0; idx < op.getNumResults(); ++idx) {
+      if (deadResults[idx])
+        continue;
+      newResults[idx] = newOp.getResult(nextNewResult++);
+    }
+    rewriter.replaceOp(op, newResults);
+    return success();
+  }
+};
+
 void IndexSwitchOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                 MLIRContext *context) {
-  results.add<FoldConstantCase>(context);
+  results.add<FoldConstantCase, FoldUnusedIndexSwitchResults>(context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 37851710ef010..984ea10f7e540 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -2207,3 +2207,34 @@ func.func @iter_args_cycles_non_cycle_start(%lb : index, %ub : index, %step : in
   }
   return %res#0, %res#1, %res#2 : i32, i32, i32
 }
+
+// -----
+
+// CHECK-LABEL: func @dead_index_switch_result(
+//  CHECK-SAME:     %[[arg0:.*]]: index
+//   CHECK-DAG:   %[[c10:.*]] = arith.constant 10
+//   CHECK-DAG:   %[[c11:.*]] = arith.constant 11
+//       CHECK:   %[[switch:.*]] = scf.index_switch %[[arg0]] -> index
+//       CHECK:   case 1 {
+//       CHECK:     memref.store %[[c10]]
+//       CHECK:     scf.yield %[[arg0]] : index
+//       CHECK:   } 
+//       CHECK:   default {
+//       CHECK:     memref.store %[[c11]]
+//       CHECK:     scf.yield %[[arg0]] : index
+//       CHECK:   }
+//       CHECK:   return %[[switch]]
+func.func @dead_index_switch_result(%arg0 : index, %arg1 : memref<i32>) -> index {
+  %non_live, %live = scf.index_switch %arg0 -> i32, index
+  case 1 {
+    %c10 = arith.constant 10 : i32
+    memref.store %c10, %arg1[] : memref<i32>
+    scf.yield %c10, %arg0 : i32, index
+  }
+  default {
+    %c11 = arith.constant 11 : i32
+    memref.store %c11, %arg1[] : memref<i32>
+    scf.yield %c11, %arg0 : i32, index
+  }
+  return %live : index
+}



More information about the llvm-branch-commits mailing list