[llvm] [SLP]Try to keep operand of external casts as scalars, if profitable (PR #110537)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 30 10:13:43 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
Changes:
If the cost of the original scalar instruction plus its scalar cast is lower than
the cost of an extractelement from the vectorized cast instruction, it is better
to keep the original scalar instructions, where possible.
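As a rough illustration, here is a hand-written LLVM IR sketch of the intended effect (not code or output from this patch; the function and value names are hypothetical). Before, the external user of lane 0 pays for an `extractelement` from the vectorized `zext`; after, the cast's operand is kept as a scalar and the cast is re-emitted, so no extract is needed when the scalar load + zext cost no more than the extract:

```llvm
; Before: the external user extracts lane 0 of the vectorized cast.
define i32 @before(ptr %p) {
  %v = load <2 x i8>, ptr %p, align 1
  %vz = zext <2 x i8> %v to <2 x i32>
  ; ... %vz also feeds the rest of the vectorized tree (omitted) ...
  %e = extractelement <2 x i32> %vz, i32 0
  ret i32 %e
}

; After: the scalar operand and its cast are kept for the external user.
define i32 @after(ptr %p) {
  %v = load <2 x i8>, ptr %p, align 1
  %vz = zext <2 x i8> %v to <2 x i32>
  ; ... %vz still feeds the rest of the vectorized tree (omitted) ...
  %s = load i8, ptr %p, align 1
  %sz = zext i8 %s to i32
  ret i32 %sz
}
```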
---
Patch is 54.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110537.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+24-3)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll (+195-208)
- (modified) llvm/test/Transforms/SLPVectorizer/insertelement-uses-vectorized-index.ll (+2-1)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e45fcb2b5c790c..72fa37bd849bd9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11563,6 +11563,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
   std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
   DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
+  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
   for (ExternalUser &EU : ExternalUses) {
     // Uses by ephemeral values are free (because the ephemeral value will be
     // removed prior to code generation, and so the extraction will be
@@ -11698,7 +11699,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
       // Can use original instruction, if no operands vectorized or they are
       // marked as externally used already.
       auto *Inst = cast<Instruction>(EU.Scalar);
-      bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
+      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
+      auto OperandIsScalar = [&](Value *V) {
         if (!getTreeEntry(V)) {
           // Some extractelements might be not vectorized, but
           // transformed into shuffle and removed from the function,
@@ -11708,9 +11710,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           return true;
         }
         return ValueToExtUses->contains(V);
-      });
+      };
+      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
+      bool CanBeUsedAsScalarCast = false;
+      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
+        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
+            Op && all_of(Op->operands(), OperandIsScalar)) {
+          InstructionCost OpCost = TTI->getInstructionCost(Op, CostKind);
+          if (ScalarCost + OpCost <= ExtraCost) {
+            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
+            ScalarCost += OpCost;
+          }
+        }
+      }
       if (CanBeUsedAsScalar) {
-        InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
         bool KeepScalar = ScalarCost <= ExtraCost;
         // Try to keep original scalar if the user is the phi node from the same
         // block as the root phis, currently vectorized. It allows to keep
@@ -11766,12 +11779,20 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           ExtraCost = ScalarCost;
           if (!IsPhiInLoop(EU))
             ExtractsCount[Entry].insert(Inst);
+          if (CanBeUsedAsScalarCast)
+            ScalarOpsFromCasts.insert(Inst->getOperand(0));
         }
       }
     }
     ExtractCost += ExtraCost;
   }
+  // Insert externals for extract of operands of casts to be emitted as scalars
+  // instead of extractelement.
+  for (Value *V : ScalarOpsFromCasts) {
+    ExternalUsesAsOriginalScalar.insert(V);
+    ExternalUses.emplace_back(V, nullptr, getTreeEntry(V)->findLaneForValue(V));
+  }
   // Add reduced value cost, if resized.
   if (!VectorizedVals.empty()) {
     const TreeEntry &Root = *VectorizableTree.front();
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 803af4d166b213..b38c636ccaf5da 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -54,38 +54,43 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5
; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5
; CHECK-NEXT: [[TMP19:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP192:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP192]], [[TMP22]]
-; CHECK-NEXT: [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP40:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP40]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT: [[TMP30:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[TMP42:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP49]]
; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
-; CHECK-NEXT: [[TMP31:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
-; CHECK-NEXT: [[TMP33:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
-; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
-; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP31:%.*]] = add <2 x i32> [[TMP25]], [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT: [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32>
+; CHECK-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP50]], [[TMP51]]
+; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP51]], [[TMP39]]
+; CHECK-NEXT: [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP56]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT: [[TMP56:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
-; CHECK-NEXT: [[TMP60:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP56]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP56]], i32 1
+; CHECK-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP37]], [[TMP52]]
+; CHECK-NEXT: [[TMP63:%.*]] = add <2 x i32> [[TMP59]], [[TMP31]]
+; CHECK-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP31]], [[TMP59]]
+; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1
; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP60]], i32 0
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
-; CHECK-NEXT: [[TMP68:%.*]] = sub i32 [[TMP47]], [[TMP48]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
+; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1
@@ -94,52 +99,55 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5
; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP52:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP76:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP76]] to i32
; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = sub <2 x i32> [[TMP52]], [[TMP61]]
+; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]]
; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32>
-; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP55]]
-; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32>
-; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP74]], [[TMP82]]
+; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP46]], [[TMP58]]
+; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
+; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32>
+; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP94]], [[TMP103]]
; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT: [[TMP103:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP72:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
-; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP94]], [[TMP72]]
+; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
+; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP108]], [[TMP75]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP70]], [[TMP85]]
-; CHECK-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP75]], [[TMP62]]
-; CHECK-NEXT: [[TMP106:%.*]] = sub <2 x i32> [[TMP62]], [[TMP75]]
-; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0
-; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1
+; CHECK-NEXT: [[TMP109:%.*]] = add <2 x i32> [[TMP70]], [[TMP106]]
+; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP109]], [[TMP82]]
+; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP82]], [[TMP109]]
+; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
-; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP115]], i32 0
+; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP115]], i32 1
; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP52]], i32 0
+; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
+; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT: [[TMP83:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
+; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
; CHECK-NEXT: [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
-; CHECK-NEXT: [[TMP84:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0
; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
; CHECK-NEXT: [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
@@ -147,60 +155,62 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[TMP107]], [[TMP68]]
-; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]]
+; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]]
+; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]]
; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP109]]
+; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
+; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP90]], [[TMP117]]
; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32>
-; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
-; CHECK-NEXT: [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP121]], [[TMP100]]
+; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
+; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32>
+; CHECK-NEXT: [[TMP132:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> <i1...
[truncated]
``````````
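The RISC-V test changes above show this pattern concretely: removed `extractelement` uses (for example, `[[TMP84:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0`) are replaced by a scalar `load i8` plus `zext i8 ... to i32` emitted next to the vectorized strided load. Below is a reduced sketch of that shape, hand-written for illustration; the function and value names are hypothetical, not taken from the test:

```llvm
declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr, i64, <2 x i1>, i32)

define i32 @sketch(ptr %pix) {
  ; The vectorized part is unchanged.
  %vec = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 %pix, i64 2, <2 x i1> <i1 true, i1 true>, i32 2)
  %vzext = zext <2 x i8> %vec to <2 x i32>
  ; %vzext would feed the rest of the vectorized tree (omitted here).
  ; The cast's operand is kept as a scalar and the cast re-emitted,
  ; instead of "extractelement <2 x i32> %vzext, i32 0".
  %scal = load i8, ptr %pix, align 1
  %szext = zext i8 %scal to i32
  ret i32 %szext
}
```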
https://github.com/llvm/llvm-project/pull/110537