[llvm] [SLP]Try to keep operand of external casts as scalars, if profitable (PR #110537)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 30 10:13:43 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)


If the combined cost of the original scalar instruction and its cast is lower
than the cost of an extractelement from the vectorized cast instruction, it is
better to keep the original scalar instructions where possible.
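
To illustrate the effect, here is a minimal IR sketch (function and value
names are illustrative, condensed from the strided-load pattern in the RISC-V
test changes below). When the external use is an `extractelement` from a
vectorized cast, and the scalar cast plus its scalar operand are costed no
higher than the extract, the scalar pair is kept alongside the vector code:

```llvm
; Before: the external use is served by an extract from the vector zext.
define i32 @before(ptr %p) {
  %v = load <2 x i8>, ptr %p, align 1
  %vz = zext <2 x i8> %v to <2 x i32>
  ; ... %vz also feeds the vectorized tree ...
  %e = extractelement <2 x i32> %vz, i32 0
  ret i32 %e
}

; After: the cast operand is kept as a scalar and cast directly, because a
; scalar load + zext is costed no higher than the extractelement here; the
; vector zext remains for the in-tree users.
define i32 @after(ptr %p) {
  %v = load <2 x i8>, ptr %p, align 1
  %vz = zext <2 x i8> %v to <2 x i32>
  ; ... %vz feeds the vectorized tree ...
  %s = load i8, ptr %p, align 1
  %e = zext i8 %s to i32
  ret i32 %e
}
```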


---

The patch is 54.95 KiB and is truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/110537.diff


3 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+24-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll (+195-208) 
- (modified) llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll (+2-1) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e45fcb2b5c790c..72fa37bd849bd9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11563,6 +11563,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
   std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
   DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
+  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
   for (ExternalUser &EU : ExternalUses) {
     // Uses by ephemeral values are free (because the ephemeral value will be
     // removed prior to code generation, and so the extraction will be
@@ -11698,7 +11699,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
       // Can use original instruction, if no operands vectorized or they are
       // marked as externally used already.
       auto *Inst = cast<Instruction>(EU.Scalar);
-      bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
+      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
+      auto OperandIsScalar = [&](Value *V) {
         if (!getTreeEntry(V)) {
           // Some extractelements might be not vectorized, but
           // transformed into shuffle and removed from the function,
@@ -11708,9 +11710,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           return true;
         }
         return ValueToExtUses->contains(V);
-      });
+      };
+      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
+      bool CanBeUsedAsScalarCast = false;
+      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
+        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
+            Op && all_of(Op->operands(), OperandIsScalar)) {
+          InstructionCost OpCost = TTI->getInstructionCost(Op, CostKind);
+          if (ScalarCost + OpCost <= ExtraCost) {
+            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
+            ScalarCost += OpCost;
+          }
+        }
+      }
       if (CanBeUsedAsScalar) {
-        InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
         bool KeepScalar = ScalarCost <= ExtraCost;
         // Try to keep original scalar if the user is the phi node from the same
         // block as the root phis, currently vectorized. It allows to keep
@@ -11766,12 +11779,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           ExtraCost = ScalarCost;
           if (!IsPhiInLoop(EU))
             ExtractsCount[Entry].insert(Inst);
+          if (CanBeUsedAsScalarCast)
+            ScalarOpsFromCasts.insert(Inst->getOperand(0));
         }
       }
     }
 
     ExtractCost += ExtraCost;
   }
+  // Insert externals for extract of operands of casts to be emitted as scalars
+  // instead of extractelement.
+  for (Value *V : ScalarOpsFromCasts) {
+    ExternalUsesAsOriginalScalar.insert(V);
+    ExternalUses.emplace_back(V, nullptr, getTreeEntry(V)->findLaneForValue(V));
+  }
   // Add reduced value cost, if resized.
   if (!VectorizedVals.empty()) {
     const TreeEntry &Root = *VectorizableTree.front();
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 803af4d166b213..b38c636ccaf5da 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -54,38 +54,43 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5
 ; CHECK-NEXT:    [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP192:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP192]], [[TMP22]]
-; CHECK-NEXT:    [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP40:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP40]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; CHECK-NEXT:    [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT:    [[TMP30:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
+; CHECK-NEXT:    [[TMP42:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
+; CHECK-NEXT:    [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP49]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
-; CHECK-NEXT:    [[TMP31:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
-; CHECK-NEXT:    [[TMP33:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
-; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP31:%.*]] = add <2 x i32> [[TMP25]], [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT:    [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[TMP35:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
+; CHECK-NEXT:    [[TMP52:%.*]] = sub <2 x i32> [[TMP50]], [[TMP51]]
+; CHECK-NEXT:    [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP51]], [[TMP39]]
+; CHECK-NEXT:    [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP56:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT:    [[TMP56:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
-; CHECK-NEXT:    [[TMP60:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP56]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP56]], i32 1
+; CHECK-NEXT:    [[TMP59:%.*]] = add <2 x i32> [[TMP37]], [[TMP52]]
+; CHECK-NEXT:    [[TMP63:%.*]] = add <2 x i32> [[TMP59]], [[TMP31]]
+; CHECK-NEXT:    [[TMP72:%.*]] = sub <2 x i32> [[TMP31]], [[TMP59]]
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1
 ; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
 ; CHECK-NEXT:    [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP60]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
-; CHECK-NEXT:    [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
@@ -94,52 +99,55 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP52:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT:    [[TMP76:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT:    [[TMP77:%.*]] = zext i8 [[TMP76]] to i32
 ; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT:    [[TMP55:%.*]] = sub <2 x i32> [[TMP52]], [[TMP61]]
+; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT:    [[TMP58:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP55]]
-; CHECK-NEXT:    [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT:    [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP82:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP74]], [[TMP82]]
+; CHECK-NEXT:    [[TMP82:%.*]] = add <2 x i32> [[TMP46]], [[TMP58]]
+; CHECK-NEXT:    [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
+; CHECK-NEXT:    [[TMP68:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
+; CHECK-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP94]], [[TMP103]]
 ; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
 ; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT:    [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP72:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP94]], [[TMP72]]
+; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; CHECK-NEXT:    [[TMP74:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP108]], [[TMP75]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP75:%.*]] = add <2 x i32> [[TMP70]], [[TMP85]]
-; CHECK-NEXT:    [[TMP76:%.*]] = add <2 x i32> [[TMP75]], [[TMP62]]
-; CHECK-NEXT:    [[TMP106:%.*]] = sub <2 x i32> [[TMP62]], [[TMP75]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1
+; CHECK-NEXT:    [[TMP109:%.*]] = add <2 x i32> [[TMP70]], [[TMP106]]
+; CHECK-NEXT:    [[TMP79:%.*]] = add <2 x i32> [[TMP109]], [[TMP82]]
+; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP82]], [[TMP109]]
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
 ; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
 ; CHECK-NEXT:    [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP115]], i32 0
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP115]], i32 1
 ; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <2 x i32> [[TMP52]], i32 0
+; CHECK-NEXT:    [[SUB59_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
+; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
 ; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
 ; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
 ; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
 ; CHECK-NEXT:    [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
 ; CHECK-NEXT:    [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
 ; CHECK-NEXT:    [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
-; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
 ; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
 ; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
 ; CHECK-NEXT:    [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
@@ -147,60 +155,62 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[TMP107]], [[TMP68]]
-; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]]
+; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
 ; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
 ; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
 ; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
 ; CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
 ; CHECK-NEXT:    [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP109:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP109]]
+; CHECK-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP117]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT:    [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT:    [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP121]], [[TMP100]]
+; CHECK-NEXT:    [[TMP119:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP121:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
+; CHECK-NEXT:    [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32>
+; CHECK-NEXT:    [[TMP132:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/110537

