[llvm] [SLP]Improve reductions for copyables/split nodes (PR #185697)

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 10 10:23:34 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-risc-v

Author: Alexey Bataev (alexey-bataev)

<details>
<summary>Changes</summary>

The original support for copyables leads to a regression in x264 on
RISC-V; this patch improves detection of the copyable candidates by more
precise checking of the profitability and adds an extra check for
split-node reduction, if it is profitable.

Fixes #<!-- -->184313


---

Patch is 40.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/185697.diff


8 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+58-28) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll (+8-9) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll (+8-9) 
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll (+79-139) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll (+19-20) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll (+19-19) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll (+2-2) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll (+4-5) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d93a3f25c3292..ae084ba315448 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3707,6 +3707,18 @@ class slpvectorizer::BoUpSLP {
     });
   }
 
+  /// Checks if it is legal and profitable to build SplitVectorize node for the
+  /// given \p VL.
+  /// \param Op1 first homogeneous scalars.
+  /// \param Op2 second homogeneous scalars.
+  /// \param ReorderIndices indices to reorder the scalars.
+  /// \returns true if the node was successfully built.
+  bool canBuildSplitNode(ArrayRef<Value *> VL,
+                         const InstructionsState &LocalState,
+                         SmallVectorImpl<Value *> &Op1,
+                         SmallVectorImpl<Value *> &Op2,
+                         OrdersType &ReorderIndices) const;
+
   ~BoUpSLP();
 
 private:
@@ -3768,18 +3780,6 @@ class slpvectorizer::BoUpSLP {
                                ArrayRef<Value *> VectorizedVals,
                                SmallPtrSetImpl<Value *> &CheckedExtracts);
 
-  /// Checks if it is legal and profitable to build SplitVectorize node for the
-  /// given \p VL.
-  /// \param Op1 first homogeneous scalars.
-  /// \param Op2 second homogeneous scalars.
-  /// \param ReorderIndices indices to reorder the scalars.
-  /// \returns true if the node was successfully built.
-  bool canBuildSplitNode(ArrayRef<Value *> VL,
-                         const InstructionsState &LocalState,
-                         SmallVectorImpl<Value *> &Op1,
-                         SmallVectorImpl<Value *> &Op2,
-                         OrdersType &ReorderIndices) const;
-
   /// This is the recursive part of buildTree.
   void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
                     unsigned InterleaveFactor = 0);
@@ -25910,6 +25910,11 @@ class HorizontalReduction {
     SmallVector<SmallVector<Value *>> LocalReducedVals;
     // Try merge consecutive reduced values into a single vectorizable group and
     // check, if they can be vectorized as copyables.
+    const bool TwoGroupsOnly = ReducedVals.size() == 2;
+    const bool TwoGroupsOfSameSmallSize =
+        TwoGroupsOnly &&
+        ReducedVals.front().size() == ReducedVals.back().size() &&
+        ReducedVals.front().size() < ReductionLimit;
     for (ArrayRef<Value *> RV : ReducedVals) {
       // Loads are not very compatible with undefs.
       if (isa<UndefValue>(RV.front()) &&
@@ -25926,22 +25931,47 @@ class HorizontalReduction {
         States.push_back(getSameOpcode(RV, TLI));
         continue;
       }
-      SmallVector<Value *> Ops;
-      if (!LocalReducedVals.empty())
-        Ops = LocalReducedVals.back();
-      Ops.append(RV.begin(), RV.end());
-      InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
-      InstructionsState OpS =
-          Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
-      if (LocalReducedVals.empty()) {
-        LocalReducedVals.push_back(Ops);
-        States.push_back(OpS);
-        continue;
-      }
-      if (OpS) {
-        LocalReducedVals.back().swap(Ops);
-        States.back() = OpS;
-        continue;
+      // Do some copyables analysis only if more than 2 groups exists or they
+      // are large enough.
+      if (!TwoGroupsOfSameSmallSize) {
+        SmallVector<Value *> Ops;
+        if (!LocalReducedVals.empty())
+          Ops = LocalReducedVals.back();
+        Ops.append(RV.begin(), RV.end());
+        InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
+        InstructionsState OpS = Analysis.buildInstructionsState(
+            Ops, V, /*TryCopyableElementsVectorization=*/true,
+            /*WithProfitabilityCheck=*/true, /*SkipSameCodeCheck=*/true);
+        if (OpS && OpS.areInstructionsWithCopyableElements()) {
+          if (LocalReducedVals.empty()) {
+            LocalReducedVals.push_back(Ops);
+            States.push_back(OpS);
+            continue;
+          }
+          LocalReducedVals.back().swap(Ops);
+          States.back() = OpS;
+          continue;
+        }
+        // For safety, allow split vectorization only if 2 groups are available
+        // overall.
+        if (TwoGroupsOnly) {
+          auto [MainOp, AltOp] = getMainAltOpsNoStateVL(Ops);
+          // Last chance to try to vectorize alternate node.
+          SmallVector<Value *> Op1, Op2;
+          BoUpSLP::OrdersType ReorderIndices;
+          if (MainOp && AltOp &&
+              V.canBuildSplitNode(Ops, InstructionsState(MainOp, AltOp), Op1,
+                                  Op2, ReorderIndices)) {
+            if (LocalReducedVals.empty()) {
+              LocalReducedVals.push_back(Ops);
+              States.push_back(OpS);
+              continue;
+            }
+            LocalReducedVals.back().swap(Ops);
+            States.back() = OpS;
+            continue;
+          }
+        }
       }
       LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
       States.push_back(getSameOpcode(RV, TLI));
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
index a2ccbb96b6003..d36da8d028c60 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
@@ -18,15 +18,14 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
 ; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
 ; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
 ; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT:    [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
-; CHECK-NEXT:    [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
-; CHECK-NEXT:    [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
-; CHECK-NEXT:    [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
-; CHECK-NEXT:    [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
-; CHECK-NEXT:    [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
-; CHECK-NEXT:    [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
-; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
+; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
+; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
+; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
index eb9b249b9a898..c3131a41c2b2e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
@@ -18,15 +18,14 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
 ; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
 ; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
 ; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT:    [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
-; CHECK-NEXT:    [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
-; CHECK-NEXT:    [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
-; CHECK-NEXT:    [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
-; CHECK-NEXT:    [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
-; CHECK-NEXT:    [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
-; CHECK-NEXT:    [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
-; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
+; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
+; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
+; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll
index 78ec0be59e1b4..b8a3ad9e63fda 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll
@@ -7,146 +7,86 @@ define i32 @test(ptr %pix1, i32 %i_pix1, ptr %pix2, i32 %i_pix2) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64
 ; CHECK-NEXT:    [[IDX_EXT31:%.*]] = sext i32 [[I_PIX2]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i8> [[TMP1]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i32> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <2 x i8> [[TMP7]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = sub nsw <2 x i32> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <2 x i32> [[TMP11]], [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <2 x i32> [[TMP6]], [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = shl nsw <2 x i32> [[TMP13]], splat (i32 16)
-; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <2 x i32> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PIX1]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[ADD_PTR32:%.*]] = getelementptr inbounds i8, ptr [[PIX2]], i64 [[IDX_EXT31]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR32]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw <2 x i32> [[TMP18]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP24:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = sub nsw <2 x i32> [[TMP24]], [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = add nsw <2 x i32> [[TMP27]], [[TMP22]]
-; CHECK-NEXT:    [[TMP29:%.*]] = sub nsw <2 x i32> [[TMP22]], [[TMP27]]
-; CHECK-NEXT:    [[TMP30:%.*]] = shl nsw <2 x i32> [[TMP29]], splat (i32 16)
-; CHECK-NEXT:    [[TMP31:%.*]] = add nsw <2 x i32> [[TMP28]], [[TMP30]]
-; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[ADD_PTR32_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR32]], i64 [[IDX_EXT31]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP32]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = load <4 x i8>, ptr [[ADD_PTR32_1]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP35]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
-; CHECK-NEXT:    [[TMP38:%.*]] = sub nsw <2 x i32> [[TMP34]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <4 x i8> [[TMP32]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP40:%.*]] = zext <2 x i8> [[TMP39]] to <2 x i32>
-; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP35]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP43:%.*]] = sub nsw <2 x i32> [[TMP40]], [[TMP42]]
-; CHECK-NEXT:    [[TMP44:%.*]] = add nsw <2 x i32> [[TMP43]], [[TMP38]]
-; CHECK-NEXT:    [[TMP45:%.*]] = sub nsw <2 x i32> [[TMP38]], [[TMP43]]
-; CHECK-NEXT:    [[TMP46:%.*]] = shl nsw <2 x i32> [[TMP45]], splat (i32 16)
-; CHECK-NEXT:    [[TMP47:%.*]] = add nsw <2 x i32> [[TMP44]], [[TMP46]]
-; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[ADD_PTR32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR32_1]], i64 [[IDX_EXT31]]
-; CHECK-NEXT:    [[TMP48:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP48]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = load <4 x i8>, ptr [[ADD_PTR32_2]], align 1
-; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP51]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; CHECK-NEXT:    [[TMP54:%.*]] = sub nsw <2 x i32> [[TMP50]], [[TMP53]]
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <4 x i8> [[TMP48]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
-; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <4 x i8> [[TMP51]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP58:%.*]] = zext <2 x i8> [[TMP57]] to <2 x i32>
-; CHECK-NEXT:    [[TMP59:%.*]] = sub nsw <2 x i32> [[TMP56]], [[TMP58]]
-; CHECK-NEXT:    [[TMP60:%.*]] = add nsw <2 x i32> [[TMP59]], [[TMP54]]
-; CHECK-NEXT:    [[TMP61:%.*]] = sub nsw <2 x i32> [[TMP54]], [[TMP59]]
-; CHECK-NEXT:    [[TMP62:%.*]] = shl nsw <2 x i32> [[TMP61]], splat (i32 16)
-; CHECK-NEXT:    [[TMP63:%.*]] = add nsw <2 x i32> [[TMP60]], [[TMP62]]
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; CHECK-NEXT:    [[SUB27:%.*]] = sub nsw i32 [[TMP64]], [[TMP65]]
-; CHECK-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP65]], [[TMP64]]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <2 x i32> [[TMP31]], i32 0
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <2 x i32> [[TMP31]], i32 1
-; CHECK-NEXT:    [[SUB27_1:%.*]] = sub nsw i32 [[TMP66]], [[TMP67]]
-; CHECK-NEXT:    [[ADD24_1:%.*]] = add nsw i32 [[TMP67]], [[TMP66]]
-; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
-; CHECK-NEXT:    [[SUB27_2:%.*]] = sub nsw i32 [[TMP68]], [[TMP69]]
-; CHECK-NEXT:    [[ADD24_2:%.*]] = add nsw i32 [[TMP69]], [[TMP68]]
-; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1
-; CHECK-NEXT:    [[SUB27_3:%.*]] = sub nsw i32 [[TMP70]], [[TMP71]]
-; CHECK-NEXT:    [[ADD24_3:%.*]] = add nsw i32 [[TMP71]], [[TMP70]]
-; CHECK-NEXT:    [[ADD45_1:%.*]] = add nsw i32 [[SUB27_1]], [[SUB27]]
-; CHECK-NEXT:    [[ADD45:%.*]] = add nsw i32 [[ADD24_1]], [[ADD24]]
-; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i32> poison, i32 [[ADD24]], i32 0
-; CHECK-NEXT:    [[TMP73:%.*]] = insertelement <2 x i32> [[TMP72]], i32 [[SUB27]], i32 1
-; CHECK-NEXT:    [[TMP74:%.*]] = insertelement <2 x i32> poison, i32 [[ADD24_1]], i32 0
-; CHECK-NEXT:    [[TMP75:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB27_1]], i32 1
-; CHECK-NEXT:    [[TMP76:%.*]] = sub nsw <2 x i32> [[TMP73]], [[TMP75]]
-; CHECK-NEXT:    [[ADD59_1:%.*]] = add nsw i32 [[SUB27_3]], [[SUB27_2]]
-; CHECK-NEXT:    [[ADD59:%.*]] = add nsw i32 [[ADD24_3]], [[ADD24_2]]
-; CHECK-NEXT:    [[TMP77:%.*]] = insertelement <2 x i32> poison, i32 [[ADD24_2]], i32 0
-; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[SUB27_2]], i32 1
-; CHECK-NEXT:    [[TMP79:%.*]] = insertelement <2 x i32> poison, i32 [[ADD24_3]], i32 0
-; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x i32> [[TMP79]], i32 [[SUB27_3]], i32 1
-; CHECK-NEXT:    [[TMP81:%.*]] = sub nsw <2 x i32> [[TMP78]], [[TMP80]]
-; CHECK-NEXT:    [[ADD67_1:%.*]] = add nsw i32 [[ADD59_1]], [[ADD45_1]]
-; CHECK-NEXT:    [[ADD67:%.*]] = add nsw i32 [[ADD59]], [[ADD45]]
-; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> poison, i32 [[ADD45]], i32 0
-; CHECK-NEXT:    [[TMP83:%.*]] = insertelement <2 x i32> [[TMP82]], i32 [[ADD45_1]], i32 1
-; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <2 x i32> poison, i32 [[ADD59]], i32 0
-; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <2 x i32> [[TMP84]], i32 [[ADD59_1]], i32 1
-; CHECK-NEXT:    [[TMP86:%.*]] = sub nsw <2 x i32> [[TMP83]], [[TMP85]]
-; CHECK-NEXT:    [[TMP87:%.*]] = add nsw <2 x i32> [[TMP81]], [[TMP76]]
-; CHECK-NEXT:    [[TMP88:%.*]] = sub nsw <2 x i32> [[TMP76]], [[TMP81]]
-; CHECK-NEXT:    [[TMP89:%.*]] = insertelement <2 x i32> poison, i32 [[ADD67]], i32 0
-; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <2 x i32> [[TMP89]], i32 [[ADD67_1]], i32 1
-; CHECK-NEXT:    [[TMP91:%.*]] = lshr <2 x i32> [[TMP90]], splat (i32 15)
-; CHECK-NEXT:    [[TMP92:%.*]] = and <2 x i32> [[TMP91]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP93:%.*]] = mul nuw <2 x i32> [[TMP92]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP94:%.*]] = add <2 x i32> [[TMP93]], [[TMP90]]
-; CHECK-NEXT:    [[TMP95:%.*]] = xor <2 x i32> [[TMP94]], [[TMP93]]
-; CHECK-NEXT:    [[TMP96:%.*]] = lshr <2 x i32> [[TMP87]], splat (i32 15)
-; CHECK-NEXT:    [[TMP97:%.*]] = and <2 x i32> [[TMP96]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP98:%.*]] = mul nuw <2 x i32> [[TMP97]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP99:%.*]] = add <2 x i32> [[TMP98]], [[TMP87]]
-; CHECK-NEXT:    [[TMP100:%.*]] = xor <2 x i32> [[TMP99]], [[TMP98]]
-; CHECK-NEXT:    [[TMP101:%.*]] = add <2 x i32> [[TMP95]], [[TMP100]]
-; CHECK-NEXT:    [[TMP102:%.*]] = lshr <2 x i32> [[TMP86]], splat (i32 15)
-; CHECK-NEXT:    [[TMP103:%.*]] = and <2 x i32> [[TMP102]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP104:%.*]] = mul nuw <2 x i32> [[TMP103]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP104]], [[TMP86]]
-; CHECK-NEXT:    [[TMP106:%.*]] = xor <2 x i32> [[TMP105]], [[TMP104]]
-; CHECK-NEXT:    [[TMP107:%.*]] = add <2 x i32> [[TMP101]], [[TMP106]]
-; CHECK-NEXT:    [[TMP108:%.*]] = lshr <2 x i32> [[TMP88]], splat (i32 15)
-; CHECK-NEXT:    [[TMP109:%.*]] = and <2 x i32> [[TMP108]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP110:%.*]] = mul nuw <2 x i32> [[TMP109]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP111:%.*]] = add <2 x i32> [[TMP110]], [[TMP88]]
-; CHECK-NEXT: ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/185697


More information about the llvm-commits mailing list