[llvm] [SLP]Try to keep operand of external casts as scalars, if profitable (PR #110537)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 30 10:13:06 PDT 2024


https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/110537

If the combined cost of the original scalar operand instruction and the
scalar cast is not higher than the cost of an extractelement from the
vectorized cast instruction, keep the original scalar instructions where
possible.


>From 2531a1ab752c0c64c3b970352d888e369d30fbdc Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 30 Sep 2024 17:12:52 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  27 +-
 .../SLPVectorizer/RISCV/complex-loads.ll      | 403 +++++++++---------
 .../insertelement-uses-vectorized-index.ll    |   3 +-
 3 files changed, 221 insertions(+), 212 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e45fcb2b5c790c..72fa37bd849bd9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11563,6 +11563,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
   std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
   DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
+  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
   for (ExternalUser &EU : ExternalUses) {
     // Uses by ephemeral values are free (because the ephemeral value will be
     // removed prior to code generation, and so the extraction will be
@@ -11698,7 +11699,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
       // Can use original instruction, if no operands vectorized or they are
       // marked as externally used already.
       auto *Inst = cast<Instruction>(EU.Scalar);
-      bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
+      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
+      auto OperandIsScalar = [&](Value *V) {
         if (!getTreeEntry(V)) {
           // Some extractelements might be not vectorized, but
           // transformed into shuffle and removed from the function,
@@ -11708,9 +11710,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           return true;
         }
         return ValueToExtUses->contains(V);
-      });
+      };
+      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
+      bool CanBeUsedAsScalarCast = false;
+      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
+        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
+            Op && all_of(Op->operands(), OperandIsScalar)) {
+          InstructionCost OpCost = TTI->getInstructionCost(Op, CostKind);
+          if (ScalarCost + OpCost <= ExtraCost) {
+            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
+            ScalarCost += OpCost;
+          }
+        }
+      }
       if (CanBeUsedAsScalar) {
-        InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
         bool KeepScalar = ScalarCost <= ExtraCost;
         // Try to keep original scalar if the user is the phi node from the same
         // block as the root phis, currently vectorized. It allows to keep
@@ -11766,12 +11779,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           ExtraCost = ScalarCost;
           if (!IsPhiInLoop(EU))
             ExtractsCount[Entry].insert(Inst);
+          if (CanBeUsedAsScalarCast)
+            ScalarOpsFromCasts.insert(Inst->getOperand(0));
         }
       }
     }
 
     ExtractCost += ExtraCost;
   }
+  // Insert externals for extract of operands of casts to be emitted as scalars
+  // instead of extractelement.
+  for (Value *V : ScalarOpsFromCasts) {
+    ExternalUsesAsOriginalScalar.insert(V);
+    ExternalUses.emplace_back(V, nullptr, getTreeEntry(V)->findLaneForValue(V));
+  }
   // Add reduced value cost, if resized.
   if (!VectorizedVals.empty()) {
     const TreeEntry &Root = *VectorizableTree.front();
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 803af4d166b213..b38c636ccaf5da 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -54,38 +54,43 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5
 ; CHECK-NEXT:    [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP192:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP192]], [[TMP22]]
-; CHECK-NEXT:    [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP40:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP40]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; CHECK-NEXT:    [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT:    [[TMP30:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
+; CHECK-NEXT:    [[TMP42:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
+; CHECK-NEXT:    [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP49]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
-; CHECK-NEXT:    [[TMP31:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
-; CHECK-NEXT:    [[TMP33:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
-; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP31:%.*]] = add <2 x i32> [[TMP25]], [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT:    [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[TMP35:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = sub <2 x i32> [[TMP50]], [[TMP51]]
+; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP51]], [[TMP39]]
+; CHECK-NEXT:    [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP56:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT:    [[TMP56:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
-; CHECK-NEXT:    [[TMP60:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP56]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP56]], i32 1
+; CHECK-NEXT:    [[TMP59:%.*]] = add <2 x i32> [[TMP37]], [[TMP52]]
+; CHECK-NEXT:    [[TMP63:%.*]] = add <2 x i32> [[TMP59]], [[TMP31]]
+; CHECK-NEXT:    [[TMP72:%.*]] = sub <2 x i32> [[TMP31]], [[TMP59]]
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1
 ; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
 ; CHECK-NEXT:    [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP60]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
-; CHECK-NEXT:    [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
@@ -94,52 +99,55 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP52:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT:    [[TMP76:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT:    [[TMP77:%.*]] = zext i8 [[TMP76]] to i32
 ; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = sub <2 x i32> [[TMP52]], [[TMP61]]
+; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT:    [[TMP58:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP55]]
-; CHECK-NEXT:    [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT:    [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP82:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP74]], [[TMP82]]
+; CHECK-NEXT:    [[TMP82:%.*]] = add <2 x i32> [[TMP46]], [[TMP58]]
+; CHECK-NEXT:    [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
+; CHECK-NEXT:    [[TMP68:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
+; CHECK-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP94]], [[TMP103]]
 ; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
 ; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT:    [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP72:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP94]], [[TMP72]]
+; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; CHECK-NEXT:    [[TMP74:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP108]], [[TMP75]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP75:%.*]] = add <2 x i32> [[TMP70]], [[TMP85]]
-; CHECK-NEXT:    [[TMP76:%.*]] = add <2 x i32> [[TMP75]], [[TMP62]]
-; CHECK-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP62]], [[TMP75]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1
+; CHECK-NEXT:    [[TMP109:%.*]] = add <2 x i32> [[TMP70]], [[TMP106]]
+; CHECK-NEXT:    [[TMP79:%.*]] = add <2 x i32> [[TMP109]], [[TMP82]]
+; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP82]], [[TMP109]]
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
 ; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
 ; CHECK-NEXT:    [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP115]], i32 0
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP115]], i32 1
 ; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <2 x i32> [[TMP52]], i32 0
+; CHECK-NEXT:    [[SUB59_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
+; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
 ; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
 ; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
 ; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
 ; CHECK-NEXT:    [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
 ; CHECK-NEXT:    [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
 ; CHECK-NEXT:    [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
-; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
 ; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
 ; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
 ; CHECK-NEXT:    [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
@@ -147,60 +155,62 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[TMP107]], [[TMP68]]
-; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]]
+; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
 ; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
 ; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
 ; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
 ; CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP109:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP109]]
+; CHECK-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP117]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT:    [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT:    [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP121]], [[TMP100]]
+; CHECK-NEXT:    [[TMP119:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP121:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
+; CHECK-NEXT:    [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32>
+; CHECK-NEXT:    [[TMP132:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP132]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP131]], [[TMP100]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT:    [[TMP132:%.*]] = sub <2 x i32> [[TMP97]], [[TMP112]]
-; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP132]]
+; CHECK-NEXT:    [[TMP133:%.*]] = sub <2 x i32> [[TMP97]], [[TMP121]]
+; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP133]]
 ; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT:    [[TMP133:%.*]] = sub <2 x i32> [[TMP86]], [[TMP108]]
-; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP133]]
+; CHECK-NEXT:    [[TMP107:%.*]] = sub <2 x i32> [[TMP86]], [[TMP112]]
+; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP107]]
 ; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
+; CHECK-NEXT:    [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
 ; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
+; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]]
 ; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15
 ; CHECK-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
 ; CHECK-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
 ; CHECK-NEXT:    [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
 ; CHECK-NEXT:    [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT:    [[TMP145:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
+; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP145]], [[TMP120]]
 ; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP134:%.*]] = sub <2 x i32> [[TMP123]], [[TMP115]]
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP134]]
+; CHECK-NEXT:    [[TMP157:%.*]] = sub <2 x i32> [[TMP123]], [[TMP134]]
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP157]]
 ; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
 ; CHECK-NEXT:    [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT:    [[TMP158:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP128]]
+; CHECK-NEXT:    [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP158]]
 ; CHECK-NEXT:    [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1
 ; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
@@ -214,23 +224,25 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]]
 ; CHECK-NEXT:    [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP157:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
+; CHECK-NEXT:    [[TMP163:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
 ; CHECK-NEXT:    [[TMP143:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
 ; CHECK-NEXT:    [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP145:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
-; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP145]], i32 1
+; CHECK-NEXT:    [[TMP165:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
+; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
+; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP146]], [[TMP98]]
 ; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15
 ; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
 ; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT:    [[TMP164:%.*]] = add <2 x i32> [[TMP145]], [[TMP157]]
-; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP157]], [[TMP145]]
-; CHECK-NEXT:    [[TMP165:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
+; CHECK-NEXT:    [[TMP167:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
+; CHECK-NEXT:    [[TMP166:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP166]], [[TMP167]]
+; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
+; CHECK-NEXT:    [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
 ; CHECK-NEXT:    [[TMP151:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> [[TMP151]], i32 [[SUB45_1]], i32 0
-; CHECK-NEXT:    [[TMP180:%.*]] = add <2 x i32> [[TMP165]], [[TMP152]]
-; CHECK-NEXT:    [[TMP154:%.*]] = sub <2 x i32> [[TMP152]], [[TMP165]]
-; CHECK-NEXT:    [[TMP166:%.*]] = extractelement <2 x i32> [[TMP145]], i32 0
+; CHECK-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
+; CHECK-NEXT:    [[TMP168:%.*]] = sub <2 x i32> [[TMP152]], [[TMP154]]
 ; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP166]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
@@ -240,74 +252,48 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP147:%.*]] = lshr <2 x i32> [[TMP113]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP148:%.*]] = and <2 x i32> [[TMP147]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP149:%.*]] = mul <2 x i32> [[TMP148]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP167:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP160:%.*]] = insertelement <2 x i32> [[TMP167]], i32 [[ADD48_3]], i32 0
-; CHECK-NEXT:    [[TMP161:%.*]] = insertelement <2 x i32> [[TMP164]], i32 [[ADD48_2]], i32 0
-; CHECK-NEXT:    [[TMP162:%.*]] = add <2 x i32> [[TMP160]], [[TMP161]]
-; CHECK-NEXT:    [[TMP163:%.*]] = sub <2 x i32> [[TMP161]], [[TMP160]]
-; CHECK-NEXT:    [[ADD95:%.*]] = extractelement <2 x i32> [[TMP162]], i32 0
-; CHECK-NEXT:    [[ADD79:%.*]] = extractelement <2 x i32> [[TMP162]], i32 1
-; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD79]]
-; CHECK-NEXT:    [[SUB105:%.*]] = sub i32 [[ADD79]], [[ADD95]]
-; CHECK-NEXT:    [[ADD94:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
-; CHECK-NEXT:    [[ADD78:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
-; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[ADD94]], [[ADD78]]
+; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
+; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
+; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
 ; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
+; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
+; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
 ; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]]
 ; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
 ; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
 ; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]]
-; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB105]]
+; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
 ; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP166]]
-; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]]
+; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I56_1]], [[SUB106]]
 ; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP146]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; CHECK-NEXT:    [[TMP168:%.*]] = extractelement <2 x i32> [[TMP180]], i32 0
-; CHECK-NEXT:    [[TMP169:%.*]] = extractelement <2 x i32> [[TMP180]], i32 1
-; CHECK-NEXT:    [[SUB102:%.*]] = add i32 [[TMP168]], [[TMP169]]
-; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT:    [[TMP170:%.*]] = insertelement <2 x i32> [[TMP180]], i32 [[ADD55_2]], i32 0
-; CHECK-NEXT:    [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP180]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP172:%.*]] = insertelement <2 x i32> [[TMP173]], i32 [[ADD55_3]], i32 0
-; CHECK-NEXT:    [[TMP175:%.*]] = sub <2 x i32> [[TMP170]], [[TMP172]]
-; CHECK-NEXT:    [[TMP174:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
-; CHECK-NEXT:    [[TMP159:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
-; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP174]], [[TMP159]]
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
-; CHECK-NEXT:    [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT:    [[TMP177:%.*]] = insertelement <2 x i32> [[TMP244]], i32 [[ADD55_3]], i32 1
-; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
-; CHECK-NEXT:    [[TMP179:%.*]] = insertelement <2 x i32> [[TMP197]], i32 [[ADD55_2]], i32 1
-; CHECK-NEXT:    [[TMP181:%.*]] = add <2 x i32> [[TMP177]], [[TMP179]]
-; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[SUB102]], [[ADD94_1]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[TMP159]], [[TMP174]]
-; CHECK-NEXT:    [[TMP193:%.*]] = insertelement <2 x i32> [[TMP192]], i32 [[SUB102]], i32 1
-; CHECK-NEXT:    [[TMP182:%.*]] = xor <2 x i32> [[TMP181]], [[TMP193]]
-; CHECK-NEXT:    [[TMP183:%.*]] = add <2 x i32> [[TMP181]], [[TMP193]]
-; CHECK-NEXT:    [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP182]], <2 x i32> [[TMP183]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP185:%.*]] = insertelement <2 x i32> poison, i32 [[ADD113]], i32 0
-; CHECK-NEXT:    [[TMP186:%.*]] = insertelement <2 x i32> [[TMP185]], i32 [[MUL_I_2]], i32 1
-; CHECK-NEXT:    [[TMP202:%.*]] = add <2 x i32> [[TMP184]], [[TMP186]]
-; CHECK-NEXT:    [[TMP203:%.*]] = extractelement <2 x i32> [[TMP202]], i32 1
-; CHECK-NEXT:    [[TMP189:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP32]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
+; CHECK-NEXT:    [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
+; CHECK-NEXT:    [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
+; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
+; CHECK-NEXT:    [[TMP203:%.*]] = add i32 [[MUL_I_2]], [[ADD103_1]]
 ; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[TMP203]], [[TMP83]]
+; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]]
+; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]]
 ; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_2]], [[SUB104_1]]
 ; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
 ; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
 ; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]]
-; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP202]], i32 0
-; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[TMP190]], [[XOR_I_1]]
+; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
+; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
 ; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
 ; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[TMP191:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; CHECK-NEXT:    [[TMP205:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP191]], [[TMP205]]
+; CHECK-NEXT:    [[TMP169:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP169]], [[TMP160]]
 ; CHECK-NEXT:    [[TMP196:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB51_2]], i32 0
 ; CHECK-NEXT:    [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP141]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP195:%.*]] = insertelement <2 x i32> [[TMP194]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT:    [[TMP206:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
+; CHECK-NEXT:    [[TMP164:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
 ; CHECK-NEXT:    [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
 ; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_6]], i32 0
@@ -315,8 +301,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP225:%.*]] = add <2 x i32> [[TMP198]], [[TMP200]]
 ; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP198]], [[TMP200]]
 ; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP204:%.*]] = extractelement <2 x i32> [[TMP206]], i32 0
-; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP206]], i32 1
+; CHECK-NEXT:    [[TMP204:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
+; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
 ; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP204]], [[TMP212]]
 ; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP212]], [[TMP204]]
 ; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
@@ -329,13 +315,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
 ; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP98]]
 ; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP208:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP208]]
-; CHECK-NEXT:    [[TMP209:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP209]]
+; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP176]]
+; CHECK-NEXT:    [[TMP177:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
+; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP177]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
-; CHECK-NEXT:    [[SUB59:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
+; CHECK-NEXT:    [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0
+; CHECK-NEXT:    [[SUB59:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1
 ; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
 ; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
 ; CHECK-NEXT:    [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -360,10 +346,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP228:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
-; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP228]]
-; CHECK-NEXT:    [[TMP229:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
-; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP229]]
+; CHECK-NEXT:    [[TMP192:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
+; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP192]]
+; CHECK-NEXT:    [[TMP193:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
+; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP193]]
 ; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
 ;
@@ -422,35 +408,35 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP66]], [[TMP22]]
 ; THR15-NEXT:    [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; THR15-NEXT:    [[TMP28:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
-; THR15-NEXT:    [[TMP30:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT:    [[TMP47:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
-; THR15-NEXT:    [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP47]]
+; THR15-NEXT:    [[TMP29:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; THR15-NEXT:    [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
+; THR15-NEXT:    [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP30]]
 ; THR15-NEXT:    [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP23]]
 ; THR15-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
 ; THR15-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
 ; THR15-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
 ; THR15-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
-; THR15-NEXT:    [[TMP49:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; THR15-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
+; THR15-NEXT:    [[TMP31:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
+; THR15-NEXT:    [[TMP47:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
 ; THR15-NEXT:    [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; THR15-NEXT:    [[TMP78:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; THR15-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP59]], [[TMP78]]
+; THR15-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
+; THR15-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP47]], [[TMP50]]
 ; THR15-NEXT:    [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
-; THR15-NEXT:    [[TMP80:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
+; THR15-NEXT:    [[TMP53:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
 ; THR15-NEXT:    [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
 ; THR15-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; THR15-NEXT:    [[TMP25:%.*]] = sub <2 x i32> [[TMP80]], [[TMP39]]
+; THR15-NEXT:    [[TMP25:%.*]] = sub <2 x i32> [[TMP53]], [[TMP39]]
 ; THR15-NEXT:    [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP35]]
-; THR15-NEXT:    [[TMP83:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
-; THR15-NEXT:    [[TMP29:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; THR15-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP83]]
-; THR15-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP83]], [[TMP29]]
-; THR15-NEXT:    [[TMP87:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
-; THR15-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
-; THR15-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP87]]
-; THR15-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP87]], [[TMP31]]
+; THR15-NEXT:    [[TMP68:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
+; THR15-NEXT:    [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
+; THR15-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP59]], [[TMP68]]
+; THR15-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP68]], [[TMP59]]
+; THR15-NEXT:    [[TMP76:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
+; THR15-NEXT:    [[TMP60:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
+; THR15-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP60]], [[TMP76]]
+; THR15-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP76]], [[TMP60]]
 ; THR15-NEXT:    [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
 ; THR15-NEXT:    [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]]
 ; THR15-NEXT:    [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
@@ -458,15 +444,17 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; THR15-NEXT:    [[TMP32:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT:    [[TMP48:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; THR15-NEXT:    [[TMP48:%.*]] = load i8, ptr null, align 1
+; THR15-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; THR15-NEXT:    [[TMP63:%.*]] = zext i8 [[TMP48]] to i32
 ; THR15-NEXT:    [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
-; THR15-NEXT:    [[TMP93:%.*]] = sub <2 x i32> [[TMP48]], [[TMP50]]
+; THR15-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
+; THR15-NEXT:    [[TMP93:%.*]] = sub <2 x i32> [[TMP49]], [[TMP61]]
 ; THR15-NEXT:    [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP53:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
-; THR15-NEXT:    [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; THR15-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; THR15-NEXT:    [[TMP41:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]]
+; THR15-NEXT:    [[TMP54:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
+; THR15-NEXT:    [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; THR15-NEXT:    [[TMP80:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
+; THR15-NEXT:    [[TMP41:%.*]] = sub <2 x i32> [[TMP54]], [[TMP80]]
 ; THR15-NEXT:    [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP93]]
 ; THR15-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
@@ -474,34 +462,33 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP44:%.*]] = load i8, ptr null, align 1
 ; THR15-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
 ; THR15-NEXT:    [[TMP45:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT:    [[TMP61:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
-; THR15-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; THR15-NEXT:    [[TMP99:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
-; THR15-NEXT:    [[TMP101:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
-; THR15-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP98]], [[TMP101]]
+; THR15-NEXT:    [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
+; THR15-NEXT:    [[TMP83:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
+; THR15-NEXT:    [[TMP87:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
+; THR15-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP87]] to <2 x i32>
+; THR15-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP83]], [[TMP98]]
 ; THR15-NEXT:    [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
 ; THR15-NEXT:    [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1
-; THR15-NEXT:    [[TMP68:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; THR15-NEXT:    [[TMP102:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; THR15-NEXT:    [[TMP70:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
-; THR15-NEXT:    [[TMP56:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
+; THR15-NEXT:    [[TMP99:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
+; THR15-NEXT:    [[TMP70:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
+; THR15-NEXT:    [[TMP101:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
+; THR15-NEXT:    [[TMP56:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
 ; THR15-NEXT:    [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP65]]
 ; THR15-NEXT:    [[TMP104:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; THR15-NEXT:    [[TMP60:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; THR15-NEXT:    [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP104]]
-; THR15-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP60]]
-; THR15-NEXT:    [[TMP76:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
-; THR15-NEXT:    [[TMP62:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
-; THR15-NEXT:    [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP76]]
-; THR15-NEXT:    [[SUB47_3:%.*]] = sub i32 [[TMP76]], [[TMP62]]
+; THR15-NEXT:    [[TMP102:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
+; THR15-NEXT:    [[ADD44_3:%.*]] = add i32 [[TMP102]], [[TMP104]]
+; THR15-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP102]]
+; THR15-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
+; THR15-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
+; THR15-NEXT:    [[ADD46_3:%.*]] = add i32 [[TMP78]], [[TMP107]]
+; THR15-NEXT:    [[SUB47_3:%.*]] = sub i32 [[TMP107]], [[TMP78]]
 ; THR15-NEXT:    [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]]
 ; THR15-NEXT:    [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]]
 ; THR15-NEXT:    [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
 ; THR15-NEXT:    [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]]
 ; THR15-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
 ; THR15-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
-; THR15-NEXT:    [[TMP63:%.*]] = extractelement <2 x i32> [[TMP48]], i32 0
 ; THR15-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP63]], 15
 ; THR15-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
 ; THR15-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
@@ -531,15 +518,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP81:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; THR15-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
 ; THR15-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP107:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
 ; THR15-NEXT:    [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
+; THR15-NEXT:    [[TMP109:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
 ; THR15-NEXT:    [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP109:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
-; THR15-NEXT:    [[TMP72:%.*]] = sub <2 x i32> [[TMP108]], [[TMP109]]
+; THR15-NEXT:    [[TMP111:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
+; THR15-NEXT:    [[TMP72:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]]
 ; THR15-NEXT:    [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT:    [[TMP111:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; THR15-NEXT:    [[TMP125:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
 ; THR15-NEXT:    [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; THR15-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
 ; THR15-NEXT:    [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
@@ -547,10 +534,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP84:%.*]] = sub <2 x i32> [[TMP94]], [[TMP96]]
 ; THR15-NEXT:    [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1
-; THR15-NEXT:    [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP111]]
+; THR15-NEXT:    [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP125]]
 ; THR15-NEXT:    [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP100]]
 ; THR15-NEXT:    [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
-; THR15-NEXT:    [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP107]]
+; THR15-NEXT:    [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP108]]
 ; THR15-NEXT:    [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP120]]
 ; THR15-NEXT:    [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> <i32 0, i32 2>
 ; THR15-NEXT:    [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]]
@@ -575,8 +562,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; THR15-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
 ; THR15-NEXT:    [[TMP131:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT:    [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; THR15-NEXT:    [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP132]]
+; THR15-NEXT:    [[TMP119:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
+; THR15-NEXT:    [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
 ; THR15-NEXT:    [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], <i32 16, i32 16>
 ; THR15-NEXT:    [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; THR15-NEXT:    [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
@@ -589,11 +576,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP128]]
 ; THR15-NEXT:    [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
 ; THR15-NEXT:    [[SHL30_1:%.*]] = shl i32 [[TMP118]], 16
-; THR15-NEXT:    [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
-; THR15-NEXT:    [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP119]]
+; THR15-NEXT:    [[TMP132:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; THR15-NEXT:    [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP132]]
 ; THR15-NEXT:    [[TMP133:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
-; THR15-NEXT:    [[TMP125:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
-; THR15-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP125]]
+; THR15-NEXT:    [[TMP147:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
+; THR15-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP147]]
 ; THR15-NEXT:    [[TMP135:%.*]] = shufflevector <2 x i32> [[TMP121]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
 ; THR15-NEXT:    [[TMP136:%.*]] = insertelement <2 x i32> [[TMP135]], i32 [[ADD43_1]], i32 1
 ; THR15-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP121]], i32 [[ADD31_1]], i32 1
@@ -679,9 +666,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]]
 ; THR15-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
 ; THR15-NEXT:    [[TMP159:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
-; THR15-NEXT:    [[TMP178:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
-; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP178]]
-; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP178]], [[TMP159]]
+; THR15-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
+; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP176]]
+; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[TMP176]], [[TMP159]]
 ; THR15-NEXT:    [[TMP163:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
 ; THR15-NEXT:    [[TMP164:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; THR15-NEXT:    [[TMP165:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -706,8 +693,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; THR15-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
 ; THR15-NEXT:    [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
 ; THR15-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]]
-; THR15-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
-; THR15-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP176]]
+; THR15-NEXT:    [[TMP178:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
+; THR15-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP178]]
 ; THR15-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; THR15-NEXT:    ret i32 [[ADD113_3]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll
index 94f973e606436e..0734fd80709f7a 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll
@@ -8,6 +8,8 @@ define void @test(ptr %0) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint <2 x ptr> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP8]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr null to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
 ; CHECK-NEXT:    switch i32 0, label %[[NEWFUNCROOT994:.*]] [
@@ -18,7 +20,6 @@ define void @test(ptr %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[NEWFUNCROOT994]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[TMP5]], i64 [[TMP6]]
 ; CHECK-NEXT:    ret void
 ;



More information about the llvm-commits mailing list