[llvm] [SLP] Improve reordering for consts, splats and ops from same nodes + improved analysis. (PR #87091)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 29 10:26:40 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
Improved the detection of constant/splat candidates, their matching, and the analysis of instructions from the same nodes.
Metric: size..text
Program size..text
results results0 diff
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 92952.00 93096.00 0.2%
test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 779832.00 780136.00 0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 839923.00 840179.00 0.0%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 392708.00 392740.00 0.0%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1171131.00 1171147.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1391089.00 1391073.00 -0.0%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1391089.00 1391073.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12352780.00 12352636.00 -0.0%
MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE - small reordering
External/SPEC/CINT2006/464.h264ref/464.h264ref - slightly better code after reordering
MultiSource/Applications/JM/lencod/lencod - smaller code with fewer shuffles
MultiSource/Applications/JM/ldecod/ldecod - same
External/SPEC/CFP2017rate/511.povray_r/511.povray_r - 2 extra loads vectorized, smaller code
External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r - better code, size increased because of more constant vectors.
External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s - same
External/SPEC/CFP2017rate/526.blender_r/526.blender_r - small change in the vectorized code; some code is a bit better, some a bit worse.
---
Patch is 55.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/87091.diff
11 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+61-16)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll (+106-106)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/addsub.ll (+4-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/entries-different-vf.ll (+5-5)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll (+3-5)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll (+2-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll (+5-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll (+8-8)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll (+2-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll (+15-17)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2875e71081d928..46243c60324a3d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1334,12 +1334,19 @@ class BoUpSLP {
return LookAheadHeuristics::ScoreSplat;
}
+ auto CheckSameEntryOrFail = [&]() {
+ if (const TreeEntry *TE1 = R.getTreeEntry(V1);
+ TE1 && TE1 == R.getTreeEntry(V2))
+ return LookAheadHeuristics::ScoreSplatLoads;
+ return LookAheadHeuristics::ScoreFail;
+ };
+
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2) {
if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
!LI2->isSimple())
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
std::optional<int> Dist = getPointersDiff(
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
@@ -1351,7 +1358,7 @@ class BoUpSLP {
FixedVectorType::get(LI1->getType(), NumLanes),
LI1->getAlign()))
return LookAheadHeuristics::ScoreMaskedGatherCandidate;
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
// The distance is too large - still may be profitable to use masked
// loads/gathers.
@@ -1408,14 +1415,14 @@ class BoUpSLP {
}
return LookAheadHeuristics::ScoreAltOpcodes;
}
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
if (I1->getParent() != I2->getParent())
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
Ops.push_back(I1);
Ops.push_back(I2);
@@ -1436,7 +1443,7 @@ class BoUpSLP {
if (isa<UndefValue>(V2))
return LookAheadHeuristics::ScoreUndef;
- return LookAheadHeuristics::ScoreFail;
+ return CheckSameEntryOrFail();
}
/// Go through the operands of \p LHS and \p RHS recursively until
@@ -1599,6 +1606,7 @@ class BoUpSLP {
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
+ const Loop *L = nullptr;
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -1767,8 +1775,9 @@ class BoUpSLP {
// Track if the operand must be marked as used. If the operand is set to
// Score 1 explicitly (because of non power-of-2 unique scalars, we may
// want to reestimate the operands again on the following iterations).
- bool IsUsed =
- RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
+ bool IsUsed = RMode == ReorderingMode::Splat ||
+ RMode == ReorderingMode::Constant ||
+ RMode == ReorderingMode::Load;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
@@ -1789,23 +1798,44 @@ class BoUpSLP {
// Look for an operand that matches the current mode.
switch (RMode) {
case ReorderingMode::Load:
- case ReorderingMode::Constant:
case ReorderingMode::Opcode: {
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
OpIdx, Idx, IsUsed);
- if (Score > static_cast<int>(BestOp.Score)) {
+ if (Score > static_cast<int>(BestOp.Score) ||
+ (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
+ Idx == OpIdx)) {
BestOp.Idx = Idx;
BestOp.Score = Score;
BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
}
break;
}
+ case ReorderingMode::Constant:
+ if (isa<Constant>(Op) ||
+ (!BestOp.Score && L && L->isLoopInvariant(Op))) {
+ BestOp.Idx = Idx;
+ if (isa<Constant>(Op)) {
+ BestOp.Score = LookAheadHeuristics::ScoreConstants;
+ BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
+ LookAheadHeuristics::ScoreConstants;
+ }
+ if (isa<UndefValue>(Op) || !isa<Constant>(Op))
+ IsUsed = false;
+ }
+ break;
case ReorderingMode::Splat:
- if (Op == OpLastLane)
+ if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
+ IsUsed = Op == OpLastLane;
+ if (Op == OpLastLane) {
+ BestOp.Score = LookAheadHeuristics::ScoreSplat;
+ BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
+ LookAheadHeuristics::ScoreSplat;
+ }
BestOp.Idx = Idx;
+ }
break;
case ReorderingMode::Failed:
llvm_unreachable("Not expected Failed reordering mode.");
@@ -1999,6 +2029,8 @@ class BoUpSLP {
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
bool OpAPO = getData(OpIdx, Lane).APO;
+ bool IsInvariant = L && L->isLoopInvariant(Op);
+ unsigned Cnt = 0;
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
if (Ln == Lane)
continue;
@@ -2008,22 +2040,37 @@ class BoUpSLP {
OperandData &Data = getData(OpI, Ln);
if (Data.APO != OpAPO || Data.IsUsed)
continue;
- if (Data.V == Op) {
+ Value *OpILane = getValue(OpI, Lane);
+ bool IsConstantOp = isa<Constant>(OpILane);
+ if (Data.V == Op ||
+ (!IsConstantOp &&
+ ((Lns > 2 && isa<Constant>(Data.V)) ||
+ (Lns == 2 &&
+ !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
+ .getOpcode() &&
+ isa<Constant>(Data.V)))) ||
+ (IsInvariant && !isa<Constant>(Data.V) &&
+ !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
+ L->isLoopInvariant(Data.V))) {
FoundCandidate = true;
- Data.IsUsed = true;
+ Data.IsUsed = Data.V == Op;
+ if (Data.V == Op)
+ ++Cnt;
break;
}
}
if (!FoundCandidate)
return false;
}
- return true;
+ return getNumLanes() == 2 || Cnt > 1;
}
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
- : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
+ : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
+ L(R.LI->getLoopFor(
+ (cast<Instruction>(RootVL.front())->getParent()))) {
// Append all the operands of RootVL.
appendOperandsOfVL(RootVL);
}
@@ -2155,8 +2202,6 @@ class BoUpSLP {
// getBestOperand().
swap(OpIdx, *BestIdx, Lane);
} else {
- // We failed to find a best operand, set mode to 'Failed'.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index d87bdfe2689916..aa9a070a794509 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -37,10 +37,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP101]], [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
@@ -64,15 +64,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]]
-; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
-; CHECK-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP40]], [[TMP39]]
-; CHECK-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
-; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
-; CHECK-NEXT: [[CONV:%.*]] = add i32 [[TMP42]], [[TMP41]]
-; CHECK-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]]
-; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT: [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
+; CHECK-NEXT: [[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
+; CHECK-NEXT: [[ADD44_3:%.*]] = add i32 [[CONV]], [[ADD44_2]]
+; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]]
+; CHECK-NEXT: [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
+; CHECK-NEXT: [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
+; CHECK-NEXT: [[ADD46_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]]
+; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]]
+; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_3]]
; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
@@ -104,10 +104,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
-; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
-; CHECK-NEXT: [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
-; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
+; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
+; CHECK-NEXT: [[TMP190:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
@@ -115,19 +115,19 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15
; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
+; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[ADD46_2]], 15
; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
-; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15
-; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
+; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15
+; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
+; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15
; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[CONV1]], 15
-; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
-; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
+; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT: [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537
+; CHECK-NEXT: [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535
; CHECK-NEXT: [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
@@ -151,21 +151,21 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]]
; CHECK-NEXT: [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
-; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP103]]
+; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
+; CHECK-NEXT: [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]]
; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[TMP165:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
+; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
; CHECK-NEXT: [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]]
-; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
-; CHECK-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
-; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP143]], [[TMP238]]
-; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
-; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP143]], 15
-; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
-; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0
+; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1
+; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]]
+; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15
; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
+; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[TMP142]], 15
+; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
+; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
; CHECK-NEXT: [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i8> poison, i8 [[TMP12]], i32 0
@@ -185,7 +185,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3>
; CHECK-NEXT: [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP153:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
; CHECK-NEXT: [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7>
; CHECK-NEXT: [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
@@ -195,15 +195,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]]
; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP153]]
+; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP144]]
; CHECK-NEXT: [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]]
-; CHECK-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT: [[TMP257:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
-; CHECK-NEXT: [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]]
-; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0
-; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP257]], i32 1
+; CHECK-NEXT: [[TMP155:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
+; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP155]]
+; CHECK-NEXT: [[TMP189:%.*]] = sub <2 x i32> [[TMP155]], [[TMP139]]
+; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
+; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]]
; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15
; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
@@ -220,37 +220,37 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]]
; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]]
; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]]
-; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP143]]
+; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
+; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP108]]
; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
; CHECK-NEXT: [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1
-; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1
-; CHECK-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
-; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/87091
More information about the llvm-commits mailing list