[llvm] b16e694 - [SLP]Try to keep operand of external casts as scalars, if profitable
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 1 10:35:46 PDT 2024
Author: Alexey Bataev
Date: 2024-10-01T13:35:42-04:00
New Revision: b16e69494811c5908c1ab2a8cf56b70b5834698c
URL: https://github.com/llvm/llvm-project/commit/b16e69494811c5908c1ab2a8cf56b70b5834698c
DIFF: https://github.com/llvm/llvm-project/commit/b16e69494811c5908c1ab2a8cf56b70b5834698c.diff
LOG: [SLP]Try to keep operand of external casts as scalars, if profitable
If the cost of the original scalar instruction plus the scalar cast is not
higher than the cost of an extractelement from the vectorized cast
instruction, it is better to keep the original scalar instructions, where possible.
Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/110537
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll
llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 54a1b99606bcb2..15e798bd6c98f9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11571,6 +11571,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
+ SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
for (ExternalUser &EU : ExternalUses) {
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
@@ -11706,7 +11707,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
// Can use original instruction, if no operands vectorized or they are
// marked as externally used already.
auto *Inst = cast<Instruction>(EU.Scalar);
- bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
+ InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
+ auto OperandIsScalar = [&](Value *V) {
if (!getTreeEntry(V)) {
// Some extractelements might be not vectorized, but
// transformed into shuffle and removed from the function,
@@ -11716,9 +11718,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
return true;
}
return ValueToExtUses->contains(V);
- });
+ };
+ bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
+ bool CanBeUsedAsScalarCast = false;
+ if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
+ if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
+ Op && all_of(Op->operands(), OperandIsScalar)) {
+ InstructionCost OpCost =
+ (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
+ ? TTI->getInstructionCost(Op, CostKind)
+ : 0;
+ if (ScalarCost + OpCost <= ExtraCost) {
+ CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
+ ScalarCost += OpCost;
+ }
+ }
+ }
if (CanBeUsedAsScalar) {
- InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
bool KeepScalar = ScalarCost <= ExtraCost;
// Try to keep original scalar if the user is the phi node from the same
// block as the root phis, currently vectorized. It allows to keep
@@ -11774,12 +11790,34 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
ExtraCost = ScalarCost;
if (!IsPhiInLoop(EU))
ExtractsCount[Entry].insert(Inst);
+ if (CanBeUsedAsScalarCast) {
+ ScalarOpsFromCasts.insert(Inst->getOperand(0));
+ // Update the users of the operands of the cast operand to avoid
+ // compiler crash.
+ if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
+ for_each(IOp->operands(), [&](Value *V) {
+ auto It = ValueToExtUses->find(V);
+ if (It != ValueToExtUses->end()) {
+ // Replace all uses to avoid compiler crash.
+ ExternalUses[It->second].User = nullptr;
+ }
+ });
+ }
+ }
}
}
}
ExtractCost += ExtraCost;
}
+ // Insert externals for extract of operands of casts to be emitted as scalars
+ // instead of extractelement.
+ for (Value *V : ScalarOpsFromCasts) {
+ ExternalUsesAsOriginalScalar.insert(V);
+ if (const TreeEntry *E = getTreeEntry(V)) {
+ ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
+ }
+ }
// Add reduced value cost, if resized.
if (!VectorizedVals.empty()) {
const TreeEntry &Root = *VectorizableTree.front();
@@ -13095,7 +13133,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
UniqueBases.insert(VecBase);
// If the only one use is vectorized - can delete the extractelement
// itself.
- if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
+ if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
+ (NumParts != 1 && count(E->Scalars, EI) > 1) ||
any_of(EI->users(), [&](User *U) {
const TreeEntry *UTE = R.getTreeEntry(U);
return !UTE || R.MultiNodeScalars.contains(U) ||
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 803af4d166b213..b38c636ccaf5da 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -54,38 +54,43 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5
; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5
; CHECK-NEXT: [[TMP19:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP192:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP192]], [[TMP22]]
-; CHECK-NEXT: [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP40:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP40]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT: [[TMP30:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[TMP42:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP49]]
; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
-; CHECK-NEXT: [[TMP31:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
-; CHECK-NEXT: [[TMP33:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
-; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP31:%.*]] = add <2 x i32> [[TMP25]], [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT: [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
+; CHECK-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP51]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP56]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT: [[TMP56:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
-; CHECK-NEXT: [[TMP60:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP56]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP56]], i32 1
+; CHECK-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP37]], [[TMP52]]
+; CHECK-NEXT: [[TMP63:%.*]] = add <2 x i32> [[TMP59]], [[TMP31]]
+; CHECK-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP31]], [[TMP59]]
+; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1
; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP60]], i32 0
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
-; CHECK-NEXT: [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
+; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
@@ -94,52 +99,55 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP76:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP76]] to i32
; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = sub <2 x i32> [[TMP52]], [[TMP61]]
+; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
-; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP55]]
-; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP74]], [[TMP82]]
+; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP46]], [[TMP58]]
+; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
+; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
+; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP94]], [[TMP103]]
; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT: [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP72:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP94]], [[TMP72]]
+; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
+; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP108]], [[TMP75]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP70]], [[TMP85]]
-; CHECK-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP75]], [[TMP62]]
-; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP62]], [[TMP75]]
-; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0
-; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1
+; CHECK-NEXT: [[TMP109:%.*]] = add <2 x i32> [[TMP70]], [[TMP106]]
+; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP109]], [[TMP82]]
+; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP82]], [[TMP109]]
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP115]], i32 0
+; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP115]], i32 1
; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP52]], i32 0
+; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
+; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT: [[TMP83:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
; CHECK-NEXT: [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
-; CHECK-NEXT: [[TMP84:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
; CHECK-NEXT: [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
@@ -147,60 +155,62 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[TMP107]], [[TMP68]]
-; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]]
+; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
+; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP109]]
+; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP117]]
; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT: [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP121]], [[TMP100]]
+; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
+; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32>
+; CHECK-NEXT: [[TMP132:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP132]] to <2 x i32>
+; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP131]], [[TMP100]]
; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT: [[TMP132:%.*]] = sub <2 x i32> [[TMP97]], [[TMP112]]
-; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP132]]
+; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP97]], [[TMP121]]
+; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP133]]
; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP86]], [[TMP108]]
-; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP133]]
+; CHECK-NEXT: [[TMP107:%.*]] = sub <2 x i32> [[TMP86]], [[TMP112]]
+; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP107]]
; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
+; CHECK-NEXT: [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
; CHECK-NEXT: [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
+; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]]
; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15
; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
; CHECK-NEXT: [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
+; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
; CHECK-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT: [[TMP145:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
; CHECK-NEXT: [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
+; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
+; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP145]], [[TMP120]]
; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP123]], [[TMP115]]
-; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP134]]
+; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP123]], [[TMP134]]
+; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP157]]
; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1
-; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT: [[TMP158:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; CHECK-NEXT: [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP128]]
+; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP158]]
; CHECK-NEXT: [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1
; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
@@ -214,23 +224,25 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]]
; CHECK-NEXT: [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP157:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
+; CHECK-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]]
; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> <i32 3, i32 1>
; CHECK-NEXT: [[TMP144:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
-; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
-; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP145]], i32 1
+; CHECK-NEXT: [[TMP165:%.*]] = add <2 x i32> [[TMP143]], [[TMP144]]
+; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
+; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
+; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP146]], [[TMP98]]
; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15
; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT: [[TMP164:%.*]] = add <2 x i32> [[TMP145]], [[TMP157]]
-; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP157]], [[TMP145]]
-; CHECK-NEXT: [[TMP165:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
+; CHECK-NEXT: [[TMP167:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
+; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
+; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP166]], [[TMP167]]
+; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
+; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]]
; CHECK-NEXT: [[TMP151:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP151]], i32 [[SUB45_1]], i32 0
-; CHECK-NEXT: [[TMP180:%.*]] = add <2 x i32> [[TMP165]], [[TMP152]]
-; CHECK-NEXT: [[TMP154:%.*]] = sub <2 x i32> [[TMP152]], [[TMP165]]
-; CHECK-NEXT: [[TMP166:%.*]] = extractelement <2 x i32> [[TMP145]], i32 0
+; CHECK-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0
+; CHECK-NEXT: [[TMP168:%.*]] = sub <2 x i32> [[TMP152]], [[TMP154]]
; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP166]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
@@ -240,74 +252,48 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP147:%.*]] = lshr <2 x i32> [[TMP113]], <i32 15, i32 15>
; CHECK-NEXT: [[TMP148:%.*]] = and <2 x i32> [[TMP147]], <i32 65537, i32 65537>
; CHECK-NEXT: [[TMP149:%.*]] = mul <2 x i32> [[TMP148]], <i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP167:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: [[TMP160:%.*]] = insertelement <2 x i32> [[TMP167]], i32 [[ADD48_3]], i32 0
-; CHECK-NEXT: [[TMP161:%.*]] = insertelement <2 x i32> [[TMP164]], i32 [[ADD48_2]], i32 0
-; CHECK-NEXT: [[TMP162:%.*]] = add <2 x i32> [[TMP160]], [[TMP161]]
-; CHECK-NEXT: [[TMP163:%.*]] = sub <2 x i32> [[TMP161]], [[TMP160]]
-; CHECK-NEXT: [[ADD95:%.*]] = extractelement <2 x i32> [[TMP162]], i32 0
-; CHECK-NEXT: [[ADD79:%.*]] = extractelement <2 x i32> [[TMP162]], i32 1
-; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD79]]
-; CHECK-NEXT: [[SUB105:%.*]] = sub i32 [[ADD79]], [[ADD95]]
-; CHECK-NEXT: [[ADD94:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
-; CHECK-NEXT: [[ADD78:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
-; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[ADD94]], [[ADD78]]
+; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
+; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
+; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
+; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
+; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]]
; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]]
-; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB105]]
+; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP166]]
-; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]]
+; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I56_1]], [[SUB106]]
; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP146]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; CHECK-NEXT: [[TMP168:%.*]] = extractelement <2 x i32> [[TMP180]], i32 0
-; CHECK-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP180]], i32 1
-; CHECK-NEXT: [[SUB102:%.*]] = add i32 [[TMP168]], [[TMP169]]
-; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i32> [[TMP180]], i32 [[ADD55_2]], i32 0
-; CHECK-NEXT: [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP180]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> [[TMP173]], i32 [[ADD55_3]], i32 0
-; CHECK-NEXT: [[TMP175:%.*]] = sub <2 x i32> [[TMP170]], [[TMP172]]
-; CHECK-NEXT: [[TMP174:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
-; CHECK-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
-; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP174]], [[TMP159]]
-; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
-; CHECK-NEXT: [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT: [[TMP177:%.*]] = insertelement <2 x i32> [[TMP244]], i32 [[ADD55_3]], i32 1
-; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
-; CHECK-NEXT: [[TMP179:%.*]] = insertelement <2 x i32> [[TMP197]], i32 [[ADD55_2]], i32 1
-; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP177]], [[TMP179]]
-; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[SUB102]], [[ADD94_1]]
-; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP159]], [[TMP174]]
-; CHECK-NEXT: [[TMP193:%.*]] = insertelement <2 x i32> [[TMP192]], i32 [[SUB102]], i32 1
-; CHECK-NEXT: [[TMP182:%.*]] = xor <2 x i32> [[TMP181]], [[TMP193]]
-; CHECK-NEXT: [[TMP183:%.*]] = add <2 x i32> [[TMP181]], [[TMP193]]
-; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP182]], <2 x i32> [[TMP183]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP185:%.*]] = insertelement <2 x i32> poison, i32 [[ADD113]], i32 0
-; CHECK-NEXT: [[TMP186:%.*]] = insertelement <2 x i32> [[TMP185]], i32 [[MUL_I_2]], i32 1
-; CHECK-NEXT: [[TMP202:%.*]] = add <2 x i32> [[TMP184]], [[TMP186]]
-; CHECK-NEXT: [[TMP203:%.*]] = extractelement <2 x i32> [[TMP202]], i32 1
-; CHECK-NEXT: [[TMP189:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP32]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
+; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
+; CHECK-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
+; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
+; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
+; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
+; CHECK-NEXT: [[TMP203:%.*]] = add i32 [[MUL_I_2]], [[ADD103_1]]
; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[TMP203]], [[TMP83]]
+; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]]
+; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]]
; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_2]], [[SUB104_1]]
; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]]
-; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP202]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[TMP190]], [[XOR_I_1]]
+; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
-; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP191]], [[TMP205]]
+; CHECK-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0
+; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP169]], [[TMP160]]
; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> [[TMP141]], i32 [[SUB51_2]], i32 0
; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP141]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> [[TMP194]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT: [[TMP206:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
+; CHECK-NEXT: [[TMP164:%.*]] = sub <2 x i32> [[TMP196]], [[TMP195]]
; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_6]], i32 0
@@ -315,8 +301,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP225:%.*]] = add <2 x i32> [[TMP198]], [[TMP200]]
; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP198]], [[TMP200]]
; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP206]], i32 0
-; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP206]], i32 1
+; CHECK-NEXT: [[TMP204:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
+; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP204]], [[TMP212]]
; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP212]], [[TMP204]]
; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
@@ -329,13 +315,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP98]]
; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP208:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP208]]
-; CHECK-NEXT: [[TMP209:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP209]]
+; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP176]]
+; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP177]]
; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP154]], i32 0
-; CHECK-NEXT: [[SUB59:%.*]] = extractelement <2 x i32> [[TMP154]], i32 1
+; CHECK-NEXT: [[SUB59_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0
+; CHECK-NEXT: [[SUB59:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1
; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
; CHECK-NEXT: [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -360,10 +346,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP228]]
-; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP229]]
+; CHECK-NEXT: [[TMP192:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP192]]
+; CHECK-NEXT: [[TMP193:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP193]]
; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
; CHECK-NEXT: ret i32 [[ADD113_3]]
;
@@ -422,35 +408,35 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP66]], [[TMP22]]
; THR15-NEXT: [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
; THR15-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
-; THR15-NEXT: [[TMP30:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32>
-; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP47]]
+; THR15-NEXT: [[TMP29:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; THR15-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
+; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP28]], [[TMP30]]
; THR15-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], <i32 16, i32 16>
; THR15-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP23]]
; THR15-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
; THR15-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
; THR15-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
; THR15-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
-; THR15-NEXT: [[TMP49:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; THR15-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
+; THR15-NEXT: [[TMP31:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
+; THR15-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
; THR15-NEXT: [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; THR15-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP59]], [[TMP78]]
+; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
+; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP47]], [[TMP50]]
; THR15-NEXT: [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
-; THR15-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
+; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
; THR15-NEXT: [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP80]], [[TMP39]]
+; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP53]], [[TMP39]]
; THR15-NEXT: [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], <i32 16, i32 16>
; THR15-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP35]]
-; THR15-NEXT: [[TMP83:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
-; THR15-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
-; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP83]]
-; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP83]], [[TMP29]]
-; THR15-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
-; THR15-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
-; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP87]]
-; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP87]], [[TMP31]]
+; THR15-NEXT: [[TMP68:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0
+; THR15-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1
+; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP59]], [[TMP68]]
+; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP68]], [[TMP59]]
+; THR15-NEXT: [[TMP76:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0
+; THR15-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1
+; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP60]], [[TMP76]]
+; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP76]], [[TMP60]]
; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
; THR15-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]]
; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
@@ -458,15 +444,17 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
; THR15-NEXT: [[TMP32:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP48:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1
+; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; THR15-NEXT: [[TMP63:%.*]] = zext i8 [[TMP48]] to i32
; THR15-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1
-; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
-; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP48]], [[TMP50]]
+; THR15-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
+; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP49]], [[TMP61]]
; THR15-NEXT: [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
-; THR15-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; THR15-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]]
+; THR15-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32>
+; THR15-NEXT: [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; THR15-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
+; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP54]], [[TMP80]]
; THR15-NEXT: [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], <i32 16, i32 16>
; THR15-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP93]]
; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
@@ -474,34 +462,33 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
; THR15-NEXT: [[TMP45:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT: [[TMP61:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
-; THR15-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; THR15-NEXT: [[TMP99:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
-; THR15-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
-; THR15-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP98]], [[TMP101]]
+; THR15-NEXT: [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
+; THR15-NEXT: [[TMP83:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
+; THR15-NEXT: [[TMP87:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
+; THR15-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP87]] to <2 x i32>
+; THR15-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP83]], [[TMP98]]
; THR15-NEXT: [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
; THR15-NEXT: [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1
-; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; THR15-NEXT: [[TMP102:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; THR15-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
-; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]]
+; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
+; THR15-NEXT: [[TMP70:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
+; THR15-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
+; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]]
; THR15-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
; THR15-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP65]]
; THR15-NEXT: [[TMP104:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; THR15-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP104]]
-; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP60]]
-; THR15-NEXT: [[TMP76:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
-; THR15-NEXT: [[TMP62:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
-; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP76]]
-; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP76]], [[TMP62]]
+; THR15-NEXT: [[TMP102:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
+; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP102]], [[TMP104]]
+; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP104]], [[TMP102]]
+; THR15-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0
+; THR15-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP58]], i32 1
+; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP78]], [[TMP107]]
+; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP107]], [[TMP78]]
; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]]
; THR15-NEXT: [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]]
; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
; THR15-NEXT: [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]]
; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
-; THR15-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP48]], i32 0
; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP63]], 15
; THR15-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
; THR15-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
@@ -531,15 +518,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP81:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
; THR15-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
+; THR15-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32>
; THR15-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
-; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP108]], [[TMP109]]
+; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
+; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]]
; THR15-NEXT: [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], <i32 16, i32 16>
; THR15-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
+; THR15-NEXT: [[TMP125:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
; THR15-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; THR15-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
; THR15-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
@@ -547,10 +534,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP94]], [[TMP96]]
; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], <i32 16, i32 16>
; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1
-; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP111]]
+; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP125]]
; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP100]]
; THR15-NEXT: [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0
-; THR15-NEXT: [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP107]]
+; THR15-NEXT: [[TMP120:%.*]] = sub <2 x i32> [[TMP92]], [[TMP108]]
; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP120]]
; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> <i32 0, i32 2>
; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]]
@@ -575,8 +562,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1
; THR15-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
; THR15-NEXT: [[TMP131:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT: [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP132]]
+; THR15-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
+; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP117]], [[TMP119]]
; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], <i32 16, i32 16>
; THR15-NEXT: [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0
@@ -589,11 +576,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP146]], [[TMP128]]
; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
; THR15-NEXT: [[SHL30_1:%.*]] = shl i32 [[TMP118]], 16
-; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
-; THR15-NEXT: [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP119]]
+; THR15-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; THR15-NEXT: [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP132]]
; THR15-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0
-; THR15-NEXT: [[TMP125:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
-; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP125]]
+; THR15-NEXT: [[TMP147:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1
+; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP133]], [[TMP147]]
; THR15-NEXT: [[TMP135:%.*]] = shufflevector <2 x i32> [[TMP121]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
; THR15-NEXT: [[TMP136:%.*]] = insertelement <2 x i32> [[TMP135]], i32 [[ADD43_1]], i32 1
; THR15-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP121]], i32 [[ADD31_1]], i32 1
@@ -679,9 +666,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]]
; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
; THR15-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP144]], i32 0
-; THR15-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
-; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP178]]
-; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP178]], [[TMP159]]
+; THR15-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP144]], i32 1
+; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP159]], [[TMP176]]
+; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP176]], [[TMP159]]
; THR15-NEXT: [[TMP163:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
; THR15-NEXT: [[TMP164:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> poison, <2 x i32> zeroinitializer
; THR15-NEXT: [[TMP165:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -706,8 +693,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
; THR15-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0
; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]]
-; THR15-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
-; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP176]]
+; THR15-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1
+; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP178]]
; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
; THR15-NEXT: ret i32 [[ADD113_3]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll
index 12fcaa1c87a9c5..e8548e0467381e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll
@@ -23,13 +23,16 @@ define void @test(ptr %0, i32 %add651) {
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 poison>, <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[ADD651]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP13]], <2 x i32> [[TMP10]], i64 2)
; CHECK-NEXT: [[TMP15:%.*]] = lshr <4 x i32> [[TMP14]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[SHR685:%.*]] = lshr i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i16>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP16]], <4 x i16> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: store <2 x i16> [[TMP17]], ptr [[ARRAYIDX689]], align 8
+; CHECK-NEXT: [[CONV686:%.*]] = trunc i32 [[SHR685]] to i16
+; CHECK-NEXT: store i16 [[CONV686]], ptr [[ARRAYIDX689]], align 8
+; CHECK-NEXT: [[ARRAYIDX727:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7818
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP16]], i32 2
+; CHECK-NEXT: store i16 [[TMP17]], ptr [[ARRAYIDX727]], align 2
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i16> [[TMP16]], i32 3
; CHECK-NEXT: store i16 [[TMP18]], ptr [[TMP4]], align 8
; CHECK-NEXT: store <4 x i16> [[TMP16]], ptr [[ARRAYIDX660]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll
index 94f973e606436e..0734fd80709f7a 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll
@@ -8,6 +8,8 @@ define void @test(ptr %0) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> <ptr null, ptr poison>, ptr [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <2 x ptr> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP8]] to i32
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr null to i64
; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
; CHECK-NEXT: switch i32 0, label %[[NEWFUNCROOT994:.*]] [
@@ -18,7 +20,6 @@ define void @test(ptr %0) {
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: ret void
; CHECK: [[NEWFUNCROOT994]]:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[TMP5]], i64 [[TMP6]]
; CHECK-NEXT: ret void
;
More information about the llvm-commits
mailing list