[llvm] [VPlan] Unroll VPReplicateRecipes by VF. (PR #142433)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 2 10:14:48 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Florian Hahn (fhahn)
---
Patch is 33.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142433.diff
16 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+1)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+8)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+6)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+44-18)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+16)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+4)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp (+81)
- (modified) llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll (-6)
- (modified) llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll (+20-31)
- (modified) llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll (+2)
- (modified) llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll (+1-5)
- (modified) llvm/test/Transforms/LoopVectorize/iv_outside_user.ll (-5)
- (modified) llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll (+2)
- (modified) llvm/test/Transforms/LoopVectorize/struct-return.ll (+4-4)
- (modified) llvm/test/Transforms/LoopVectorize/uniform-blend.ll (+4)
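
The heart of the change, visible in the diff below: VPReplicateRecipes outside replicate regions are now unrolled explicitly into one single-scalar recipe per lane, and the new BuildVector / BuildStructVector VPInstructions pack the per-lane scalars back into a vector. As a rough sketch of what BuildVector lowers to (not taken from the patch; hypothetical lane values %a0..%a3, VF = 4):

```llvm
; BuildVector starts from a poison vector and inserts one operand per lane,
; mirroring the CreateInsertElement loop in VPInstruction::generate.
%v0 = insertelement <4 x float> poison, float %a0, i32 0
%v1 = insertelement <4 x float> %v0, float %a1, i32 1
%v2 = insertelement <4 x float> %v1, float %a2, i32 2
%v3 = insertelement <4 x float> %v2, float %a3, i32 3
```
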
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e9ace195684b3..beeab51e0806a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7557,6 +7557,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// cost model is complete for better cost estimates.
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
OrigLoop->getHeader()->getContext());
+ VPlanTransforms::runPass(VPlanTransforms::unrollByVF, BestVPlan, BestVF);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 165b57c87beb1..c09970ef54103 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -261,6 +261,14 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
return Data.VPV2Scalars[Def][0];
}
+ // Look through BuildVector to avoid redundant extracts.
+ // TODO: Remove once replicate regions are unrolled explicitly.
+ auto *BV = dyn_cast<VPInstruction>(Def);
+ if (Lane.getKind() == VPLane::Kind::First && BV &&
+ BV->getOpcode() == VPInstruction::BuildVector) {
+ return get(BV->getOperand(Lane.getKnownLane()), true);
+ }
+
assert(hasVectorValue(Def));
auto *VecPart = Data.VPV2Vector[Def];
if (!VecPart->getType()->isVectorTy()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 44f0b6d964a6e..cd0ee979c5943 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -907,6 +907,12 @@ class VPInstruction : public VPRecipeWithIRFlags,
BranchOnCount,
BranchOnCond,
Broadcast,
+ /// Creates a vector containing all operands. The vector element count
+ /// matches the number of operands.
+ BuildVector,
+ /// Creates a struct of vectors containing all operands. The vector element
+ /// count matches the number of operands.
+ BuildStructVector,
ComputeFindLastIVResult,
ComputeReductionResult,
// Extracts the last lane from its operand if it is a vector, or the last
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 926490bfad7d0..66df7e3ebf802 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -104,6 +104,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::AnyOf:
+ case VPInstruction::BuildVector:
+ case VPInstruction::BuildStructVector:
return SetResultTyFromOp();
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a4831ea7c11f7..69b49430b6659 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -493,6 +493,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case Instruction::ExtractElement: {
assert(State.VF.isVector() && "Only extract elements from vectors");
+ return State.get(getOperand(0),
+ VPLane(cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
+ ->getZExtValue()));
Value *Vec = State.get(getOperand(0));
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
return Builder.CreateExtractElement(Vec, Idx, Name);
@@ -604,6 +607,34 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreateVectorSplat(
State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
}
+ case VPInstruction::BuildVector: {
+ auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
+ Value *Res = PoisonValue::get(
+ toVectorizedTy(ScalarTy, ElementCount::getFixed(getNumOperands())));
+ for (const auto &[Idx, Op] : enumerate(operands()))
+ Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
+ State.Builder.getInt32(Idx));
+ return Res;
+ }
+ case VPInstruction::BuildStructVector: {
+ // For struct types, we need to build a new 'wide' struct type, where each
+ // element is widened.
+ auto *STy =
+ cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
+ Value *Res = PoisonValue::get(
+ toVectorizedTy(STy, ElementCount::getFixed(getNumOperands())));
+ for (const auto &[Idx, Op] : enumerate(operands())) {
+ for (unsigned I = 0, E = STy->getNumElements(); I != E; I++) {
+ Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I);
+ Value *VectorValue = Builder.CreateExtractValue(Res, I);
+ VectorValue =
+ Builder.CreateInsertElement(VectorValue, ScalarValue, Idx);
+ Res = Builder.CreateInsertValue(Res, VectorValue, I);
+ }
+ }
+ return Res;
+ }
+
case VPInstruction::ComputeFindLastIVResult: {
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
// and will be removed by breaking up the recipe further.
@@ -864,10 +895,11 @@ void VPInstruction::execute(VPTransformState &State) {
if (!hasResult())
return;
assert(GeneratedValue && "generate must produce a value");
- assert(
- (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
- State.VF.isScalar()) &&
- "scalar value but not only first lane defined");
+ assert((((GeneratedValue->getType()->isVectorTy() ||
+ GeneratedValue->getType()->isStructTy()) ==
+ !GeneratesPerFirstLaneOnly) ||
+ State.VF.isScalar()) &&
+ "scalar value but not only first lane defined");
State.set(this, GeneratedValue,
/*IsScalar*/ GeneratesPerFirstLaneOnly);
}
@@ -881,6 +913,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::AnyOf:
+ case VPInstruction::BuildVector:
+ case VPInstruction::BuildStructVector:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLastElement:
@@ -999,6 +1033,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::Broadcast:
O << "broadcast";
break;
+ case VPInstruction::BuildVector:
+ O << "buildvector";
+ break;
+ case VPInstruction::BuildStructVector:
+ O << "buildstructvector";
+ break;
case VPInstruction::ExtractLastElement:
O << "extract-last-element";
break;
@@ -2758,20 +2798,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
scalarizeInstruction(UI, this, VPLane(0), State);
return;
}
-
- // A store of a loop varying value to a uniform address only needs the last
- // copy of the store.
- if (isa<StoreInst>(UI) && vputils::isSingleScalar(getOperand(1))) {
- auto Lane = VPLane::getLastLaneForVF(State.VF);
- scalarizeInstruction(UI, this, VPLane(Lane), State);
- return;
- }
-
- // Generate scalar instances for all VF lanes.
- assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- const unsigned EndLane = State.VF.getKnownMinValue();
- for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- scalarizeInstruction(UI, this, VPLane(Lane), State);
}
bool VPReplicateRecipe::shouldPack() const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5b42a9056b69e..d2c17b7f52b76 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1142,6 +1142,22 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
+ // Look through Extract(Last|Penultimate)Element (BuildVector ....).
+ if (match(&R,
+ m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) ||
+ match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
+ m_VPValue(A)))) {
+ unsigned Offset = cast<VPInstruction>(&R)->getOpcode() ==
+ VPInstruction::ExtractLastElement
+ ? 1
+ : 2;
+ auto *BV = dyn_cast<VPInstruction>(A);
+ if (BV && BV->getOpcode() == VPInstruction::BuildVector) {
+ Def->replaceAllUsesWith(BV->getOperand(BV->getNumOperands() - Offset));
+ return;
+ }
+ }
+
// Some simplifications can only be applied after unrolling. Perform them
// below.
if (!Plan->isUnrolled())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 34e2de4eb3b74..f45b7a7969d04 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -99,6 +99,10 @@ struct VPlanTransforms {
/// Explicitly unroll \p Plan by \p UF.
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
+ /// Explicitly unroll VPReplicateRecipes outside of replicate regions by \p
+ /// VF.
+ static void unrollByVF(VPlan &Plan, ElementCount VF);
+
/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index e1fb3d476c58d..331b395f30490 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -15,6 +15,7 @@
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
+#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
@@ -428,3 +429,83 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
VPlanTransforms::removeDeadRecipes(Plan);
}
+
+/// Create a single-scalar clone of RepR for lane \p Lane.
+static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
+ Type *IdxTy, VPReplicateRecipe *RepR,
+ VPLane Lane) {
+ // Collect the operands at Lane, creating extracts as needed.
+ SmallVector<VPValue *> NewOps;
+ for (VPValue *Op : RepR->operands()) {
+ if (vputils::isSingleScalar(Op)) {
+ NewOps.push_back(Op);
+ continue;
+ }
+ VPValue *Ext;
+ if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+ Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op});
+ } else {
+ // Look through buildvector to avoid unnecessary extracts.
+ auto *BV = dyn_cast<VPInstruction>(Op);
+ if (BV && BV->getOpcode() == VPInstruction::BuildVector) {
+ NewOps.push_back(BV->getOperand(Lane.getKnownLane()));
+ continue;
+ }
+ VPValue *Idx =
+ Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
+ }
+ NewOps.push_back(Ext);
+ }
+
+ auto *New =
+ new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
+ /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
+ New->insertBefore(RepR);
+ return New;
+}
+
+void VPlanTransforms::unrollByVF(VPlan &Plan, ElementCount VF) {
+ Type *IdxTy = IntegerType::get(
+ Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ if (!RepR || RepR->isSingleScalar())
+ continue;
+
+ VPBuilder Builder(RepR);
+ SmallVector<VPValue *> LaneDefs;
+ // Stores to invariant addresses only need to store the last lane.
+ if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+ vputils::isSingleScalar(RepR->getOperand(1))) {
+ cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF));
+ RepR->eraseFromParent();
+ continue;
+ }
+
+ /// Create single-scalar version of RepR for all lanes.
+ for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+ LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
+
+ /// Users that only demand the first lane can use the definition for lane
+ /// 0.
+ RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
+ return U.onlyFirstLaneUsed(RepR);
+ });
+
+ Type *ResTy = RepR->getUnderlyingInstr()->getType();
+ // If needed, create a Build(Struct)Vector recipe to insert the scalar
+ // lane values into a vector.
+ if (!ResTy->isVoidTy()) {
+ VPValue *VecRes = Builder.createNaryOp(
+ ResTy->isStructTy() ? VPInstruction::BuildStructVector
+ : VPInstruction::BuildVector,
+ LaneDefs);
+ RepR->replaceAllUsesWith(VecRes);
+ }
+ RepR->eraseFromParent();
+ }
+ }
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 83e9d6146755d..743aedee38012 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -398,12 +398,6 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3
; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0
-; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1
-; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2
-; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4
; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3
; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4
; CHECK-NEXT: store float [[TMP36]], ptr [[B:%.*]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
index cdc7839bfc0f0..95258e65bbe3d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
@@ -32,42 +32,31 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 104
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 112
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 120
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]]
+; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP0]]
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP1]]
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP2]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP3]]
-; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]]
+; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP4]]
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP5]]
; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP6]]
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP7]]
-; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]]
+; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP8]]
; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP9]]
; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP10]]
-; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]]
-; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]]
-; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]]
-; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]]
+; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP11]]
+; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP12]]
+; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP13]]
+; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP14]]
; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[M]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 4
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 4
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 4
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i64 4
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 4
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 4
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i64 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i64 4
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i64 4
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i64 4
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[NEXT_GEP11]], i64 4
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[NEXT_GEP12]], i64 4
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[NEXT_GEP13]], i64 4
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[NEXT_GEP14]], i64 4
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[NEXT_GEP15]], i64 4
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[NEXT_GEP16]], i64 4
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP16]], i32 -4
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP20]], i32 -4
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP24]], i32 -4
-; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -4
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP29]], i32 -4
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP30]], i32 -4
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -85,7 +74,7 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
; CHECK-NEXT: [[TMP38:%.*]] = add <4 x i32> [[STRIDED_VEC23]], [[STRIDED_VEC22]]
; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[STRIDED_VEC26]], [[STRIDED_VEC25]]
; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0
-; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP12]], align 4
; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1
; CHECK-NEXT: store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4
; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2
@@ -93,7 +82,7 @@ define ptr @test_in...
[truncated]
``````````
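For struct-returning replicated calls, BuildStructVector does the same job field by field. A minimal sketch under assumed types (hypothetical lane results %r0 and %r1 of type { float, float }, VF = 2; the intermediate extractvalue/insertvalue round-trips that the generate() loop threads through the result are folded here for brevity):

```llvm
; Gather field 0 across lanes into the first member vector.
%e00 = extractvalue { float, float } %r0, 0
%f0a = insertelement <2 x float> poison, float %e00, i32 0
%e10 = extractvalue { float, float } %r1, 0
%f0  = insertelement <2 x float> %f0a, float %e10, i32 1
; Gather field 1 across lanes into the second member vector.
%e01 = extractvalue { float, float } %r0, 1
%f1a = insertelement <2 x float> poison, float %e01, i32 0
%e11 = extractvalue { float, float } %r1, 1
%f1  = insertelement <2 x float> %f1a, float %e11, i32 1
; Re-pack both member vectors into the widened struct-of-vectors.
%res0 = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %f0, 0
%res  = insertvalue { <2 x float>, <2 x float> } %res0, <2 x float> %f1, 1
```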
https://github.com/llvm/llvm-project/pull/142433
More information about the llvm-commits mailing list