[llvm] [VPlan] Implement interleaving as VPlan-to-VPlan transform. (PR #95842)

via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 23 03:15:27 PDT 2024


================
@@ -1572,3 +1572,421 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
     }
   }
 }
+
+/// Return the value that stands in for \p V in a given unrolled part.
+///
+/// \param InterleavedValues maps an original value to the values created for
+///        parts 1, 2, ...; the value for part P is stored at index P - 1.
+/// \param V the original (part 0) value.
+/// \param IC despite its name, the zero-based index of the part to look up,
+///        not the interleave count.
+/// \returns \p V itself for part 0 and for live-ins, otherwise the recorded
+///          value for the requested part.
+static VPValue *getInterleavedValue(
+    DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues, VPValue *V,
+    unsigned IC) {
+  if (IC == 0 || V->isLiveIn())
+    return V;
+  // Look the value up without mutating the map: operator[] would silently
+  // insert an empty vector for an unknown key and the subsequent indexing
+  // would then read out of bounds.
+  auto It = InterleavedValues.find(V);
+  assert(It != InterleavedValues.end() && IC - 1 < It->second.size() &&
+         "no value recorded for the requested part");
+  return It->second[IC - 1];
+}
+
+/// Clone the replicate region \p VPR once for every part 1..IC-1, chaining the
+/// clones directly after \p VPR. The operands of every cloned recipe are
+/// remapped to the values of the clone's part, and the values defined by the
+/// cloned recipes are recorded in \p InterleavedValues under the corresponding
+/// original values.
+static void interleaveReplicateRegion(
+    VPRegionBlock *VPR, VPlan &Plan, unsigned IC,
+    DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues) {
+  using ShallowRPOT =
+      ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>;
+
+  Type *PartIdxTy = Plan.getCanonicalIV()->getScalarType();
+  VPBlockBase *LastInserted = VPR;
+  for (unsigned Part = 1; Part != IC; ++Part) {
+    auto *ClonedRegion = VPR->clone();
+    VPBlockUtils::insertBlockAfter(ClonedRegion, LastInserted);
+    LastInserted = ClonedRegion;
+
+    // Walk the clone and the original in lock-step so that every cloned
+    // recipe can be paired with the recipe it was cloned from.
+    ShallowRPOT ClonedOrder(ClonedRegion->getEntry());
+    ShallowRPOT OrigOrder(VPR->getEntry());
+    for (const auto &[ClonedBB, OrigBB] :
+         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(ClonedOrder),
+             VPBlockUtils::blocksOnly<VPBasicBlock>(OrigOrder))) {
+      // Stop as soon as the traversal leaves the cloned region.
+      if (ClonedBB->getParent() != ClonedRegion)
+        break;
+
+      for (const auto &[ClonedR, OrigR] : zip(*ClonedBB, *OrigBB)) {
+        // Redirect every operand to its value for this part.
+        for (unsigned OpIdx = 0; OpIdx != ClonedR.getNumOperands(); ++OpIdx) {
+          VPValue *Remapped = getInterleavedValue(
+              InterleavedValues, ClonedR.getOperand(OpIdx), Part);
+          ClonedR.setOperand(OpIdx, Remapped);
+        }
+
+        // Scalar IV steps additionally receive the part number as operand.
+        if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&ClonedR))
+          Steps->addOperand(
+              Plan.getOrAddLiveIn(ConstantInt::get(PartIdxTy, Part)));
+
+        // Record the clone's results as this part's values of the originals.
+        unsigned DefIdx = 0;
+        for (VPValue *OrigDef : OrigR.definedValues()) {
+          InterleavedValues[OrigDef].push_back(ClonedR.getVPValue(DefIdx));
+          ++DefIdx;
+        }
+      }
+    }
+  }
+}
+
+/// Interleave the header phi recipe \p R for \p IC parts.
+///
+/// First-order recurrence phis are left untouched. For a
+/// VPWidenIntOrFpInductionRecipe no clones are made; instead a per-part step
+/// is computed in the block preceding the vector loop region and a chain of
+/// "step.add" recipes is created at \p InsertPtForPhi, one per additional
+/// part. Every other header phi is cloned IC - 1 times.
+///
+/// \param InsertPtForPhi position at which the induction adds are created; it
+///        is advanced past every add created here.
+/// \param InterleavedValues receives the per-part values of \p R's results.
+/// \param ToSkip receives the helper recipes created here.
+/// \param PhisToRemap receives one new inner vector holding the clones of
+///        \p R; not populated for pointer inductions, nor for the clone that
+///        is erased for an ordered reduction.
 static void interleaveHeaderPHI(
     VPRecipeBase &R, VPlan &Plan, unsigned IC,
     VPBasicBlock::iterator &InsertPtForPhi,
     DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
     VPTypeAnalysis &TypeInfo, SmallPtrSet<VPRecipeBase *, 8> &ToSkip,
     SmallVector<SmallVector<VPHeaderPHIRecipe *>> &PhisToRemap) {
   if (isa<VPFirstOrderRecurrencePHIRecipe>(&R))
     return;
 
   // Generate step vectors for each unrolled part.
   if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+    // Helper recipes are appended to the single predecessor of the vector
+    // loop region.
     VPBasicBlock *PH =
         cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
+    // The per-part step starts out as VF and is scaled by the induction step
+    // (operand 1) below unless that step is the constant 1.
     VPValue *Step = &Plan.getVF();
     Type *IVTy = TypeInfo.inferScalarType(IV);
     auto &ID = IV->getInductionDescriptor();
+    // Fast-math flags are only taken over from a floating-point induction
+    // binary operator; otherwise they stay default-constructed.
     FastMathFlags FMFs;
     if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
       FMFs = ID.getInductionBinOp()->getFastMathFlags();
 
+    // Convert VF to the induction's scalar type if the types differ.
     if (TypeInfo.inferScalarType(Step) != IVTy) {
       Instruction::CastOps CastOp;
       if (IVTy->isFloatingPointTy())
         CastOp = Instruction::UIToFP;
       else
         CastOp = Instruction::Trunc;
       Step = new VPWidenCastRecipe(CastOp, Step, IV->getScalarType());
       PH->appendRecipe(Step->getDefiningRecipe());
       ToSkip.insert(Step->getDefiningRecipe());
     }
 
+    // ConstScale is non-null only if operand 1 is a live-in integer constant.
     auto *ConstScale =
         IV->getOperand(1)->isLiveIn()
             ? dyn_cast<ConstantInt>(IV->getOperand(1)->getLiveInIRValue())
             : nullptr;
+    // Multiply by the induction step unless it is known to be 1.
     if (!ConstScale || ConstScale->getZExtValue() != 1) {
       VPValue *Scale = IV->getOperand(1);
       if (TypeInfo.inferScalarType(Scale) != IVTy) {
         Scale = new VPWidenCastRecipe(Instruction::Trunc, Scale,
                                       IV->getScalarType());
         PH->appendRecipe(Scale->getDefiningRecipe());
         ToSkip.insert(Scale->getDefiningRecipe());
       }
 
       VPBuilder Builder(PH);
       VPInstruction *Mul;
       if (IVTy->isFloatingPointTy())
         Mul = Builder.createFPOp(Instruction::FMul, {Step, Scale},
                                  R.getDebugLoc(), "", FMFs);
       else
         Mul = Builder.createNaryOp(Instruction::Mul, {Step, Scale},
                                    R.getDebugLoc());
       Step = Mul;
       ToSkip.insert(Mul);
     }
+    // The per-part step becomes an additional operand of the induction.
     R.addOperand(Step);
 
+    // Part I is the value of part I - 1 advanced by Step.
     for (unsigned I = 1; I != IC; ++I) {
       VPBuilder Builder;
       Builder.setInsertPoint(R.getParent(), InsertPtForPhi);
       auto Ins = InterleavedValues.insert({IV, {}});
       VPValue *Prev = getInterleavedValue(InterleavedValues, IV, I - 1);
       VPInstruction *Add;
+      // The first add is named "step.add"; later ones get the part number as
+      // suffix.
       std::string Name = I > 1 ? "step.add." + std::to_string(I) : "step.add";
 
       if (IVTy->isFloatingPointTy())
         Add = Builder.createFPOp(ID.getInductionOpcode(),
                                  {
                                      Prev,
                                      Step,
                                  },
                                  R.getDebugLoc(), Name, FMFs);
       else
         Add = Builder.createNaryOp(Instruction::Add,
                                    {
                                        Prev,
                                        Step,
                                    },
                                    R.getDebugLoc(), Name);
       ToSkip.insert(Add);
       Ins.first->second.push_back(Add);
+      // Keep the adds in part order: the next one goes right after this one.
       InsertPtForPhi = std::next(Add->getIterator());
     }
+    // The value of the last part is added as a further operand.
     R.addOperand(getInterleavedValue(InterleavedValues, IV, IC - 1));
     return;
   }
 
+  // All remaining header phis are cloned once per additional part, each clone
+  // placed directly after the previous one.
   VPRecipeBase *InsertPt = &R;
   Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
   for (unsigned I = 1; I != IC; ++I) {
     VPRecipeBase *Copy = R.clone();
     Copy->insertAfter(InsertPt);
     InsertPt = Copy;
+    // Record the clone's results as the part-I values of R's results.
     unsigned Idx = 0;
     for (VPValue *Res : R.definedValues()) {
       auto Ins = InterleavedValues.insert({Res, {}});
       Ins.first->second.push_back(Copy->getVPValue(Idx));
       Idx++;
     }
+    // Pointer inductions: the original receives IC as extra operand (once);
+    // every clone receives IC, the original's value and its part number.
+    // Clones of pointer inductions are not added to PhisToRemap.
     if (isa<VPWidenPointerInductionRecipe>(&R)) {
       if (I == 1)
         R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
       Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
       Copy->addOperand(R.getVPSingleValue());
       Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
       continue;
     }
 
     if (auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+      // Ordered reductions keep a single phi: drop the clone and stop.
+      // NOTE(review): the erased clone's value was already recorded in
+      // InterleavedValues above; presumably that entry is replaced when the
+      // reduction recipe itself is interleaved -- confirm.
       if (RdxPhi->isOrdered()) {
         Copy->eraseFromParent();
         break;
       }
+      // Other reduction phi clones receive their part number as operand.
       Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
     }
 
+    // Start a new group of clones for this phi when its first clone is made.
     if (I == 1)
       PhisToRemap.emplace_back();
 
     auto *H = cast<VPHeaderPHIRecipe>(Copy);
     PhisToRemap.back().push_back(H);
   }
 }
+
+/// Interleave the (non header phi) recipe \p R for \p IC parts.
+///
+/// Recipes that produce a single value shared by all parts are recorded in
+/// \p InterleavedValues as their own value for every part 1..IC-1. All other
+/// recipes are cloned IC - 1 times; each clone is inserted after the previous
+/// one, its operands are remapped to the values of its part and its results
+/// are recorded in \p InterleavedValues.
+///
+/// \param R the recipe to interleave; it keeps representing part 0.
+/// \param Plan the plan \p R belongs to; used to create live-in constants.
+/// \param IC the interleave count.
+/// \param InterleavedValues per-part values of all values processed so far.
+/// \param TypeInfo currently unused.
+static void
+interleaveRecipe(VPRecipeBase &R, VPlan &Plan, unsigned IC,
+                 DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
+                 VPTypeAnalysis &TypeInfo) {
+  using namespace llvm::VPlanPatternMatch;
+
+  // Record \p V as its own value for each of the parts 1..IC-1, so that a
+  // lookup for any part succeeds and yields \p V.
+  auto AddUniformForAllParts = [&InterleavedValues, IC](VPValue *V) {
+    SmallVector<VPValue *> &Parts = InterleavedValues[V];
+    for (unsigned Part = 1; Part != IC; ++Part)
+      Parts.push_back(V);
+  };
+
+  VPValue *Op1;
+  if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
+                    m_VPValue(), m_VPValue(Op1)))) {
+    // A single recipe consumes all parts: append the value of every
+    // additional part of its second operand.
+    for (unsigned I = 1; I != IC; ++I)
+      R.addOperand(getInterleavedValue(InterleavedValues, Op1, I));
+    AddUniformForAllParts(R.getVPSingleValue());
+    return;
+  }
+  VPValue *Op0;
+  if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(m_VPValue(Op0),
+                                                               m_VPValue()))) {
+    AddUniformForAllParts(R.getVPSingleValue());
+
+    // With a vector VF, extract from the value of the last part.
+    if (!Plan.hasScalarVFOnly()) {
+      R.setOperand(0, getInterleavedValue(InterleavedValues, Op0, IC - 1));
+      return;
+    }
+    // With only scalar VFs the recipe falls through to the handling below.
+  }
+
+  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+  if (isa<VPInstruction>(&R) && cast<VPInstruction>(&R)->getOpcode() ==
+                                    VPInstruction::CalculateTripCountMinusVF) {
+    // The interleave count is passed as additional operand.
+    R.addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, IC)));
+    AddUniformForAllParts(R.getVPSingleValue());
+    return;
+  }
+
+  // Branches are neither cloned nor recorded.
+  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
+    if (VPI->getOpcode() == VPInstruction::BranchOnCount ||
+        VPI->getOpcode() == VPInstruction::BranchOnCond)
+      return;
+  }
+
+  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+    // A store whose operand 1 is defined outside the vector regions is not
+    // cloned; it stores the value of the last part.
+    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
+        RepR->getOperand(1)->isDefinedOutsideVectorRegions()) {
+      R.setOperand(
+          0, getInterleavedValue(InterleavedValues, R.getOperand(0), IC - 1));
+      return;
+    }
+    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
+      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
+        // Register the single declaration for every part (not just part 1),
+        // consistent with the other uniform cases.
+        AddUniformForAllParts(RepR);
+        return;
+      }
+    }
+  }
+
+  // TODO: Generalize for any uniform recipe.
+  if (auto *Cast = dyn_cast<VPWidenCastRecipe>(&R)) {
+    if (Cast->getOperand(0)->isLiveIn()) {
+      // A cast of a live-in is shared by all parts. Register it for every
+      // part; recording it only once would make lookups for parts >= 2 index
+      // past the end of the recorded values when IC > 2.
+      AddUniformForAllParts(Cast);
+      return;
+    }
+  }
+
+  if (isa<VPInstruction>(&R) &&
+      vputils::onlyFirstPartUsed(R.getVPSingleValue())) {
+    AddUniformForAllParts(R.getVPSingleValue());
+    return;
+  }
+
+  // Generic case: one clone per additional part.
+  VPRecipeBase *InsertPt = &R;
+  for (unsigned I = 1; I != IC; ++I) {
+    VPRecipeBase *Copy = R.clone();
+    Copy->insertAfter(InsertPt);
+    InsertPt = Copy;
+    // Record the clone's results as the part-I values of R's results.
+    unsigned Idx = 0;
+    for (VPValue *Res : R.definedValues()) {
+      InterleavedValues[Res].push_back(Copy->getVPValue(Idx));
+      Idx++;
+    }
+
+    if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
+      if (VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
+        Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+      }
+      if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) {
+        // The splice for part I combines parts I - 1 and I of R's second
+        // operand; no further remapping is applied to it.
+        Copy->setOperand(
+            0, getInterleavedValue(InterleavedValues, R.getOperand(1), I - 1));
+        Copy->setOperand(
+            1, getInterleavedValue(InterleavedValues, R.getOperand(1), I));
+        continue;
+      }
+    }
+    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
+      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
+      if (Phi->isOrdered()) {
+        // Ordered reductions are chained: the phi's value for part I becomes
+        // the reduction of part I - 1, and the phi's second operand is set to
+        // the most recent clone.
+        auto Ins = InterleavedValues.insert({Phi, {}});
+        if (I == 1) {
+          Ins.first->second.clear();
+          Ins.first->second.push_back(Red);
+        }
+        Ins.first->second.push_back(Copy->getVPSingleValue());
+        Phi->setOperand(1, Copy->getVPSingleValue());
+      }
+    }
+    // Redirect every operand of the clone to its value for part I.
+    for (unsigned Idx = 0; Idx != Copy->getNumOperands(); ++Idx)
+      Copy->setOperand(Idx, getInterleavedValue(InterleavedValues,
+                                                Copy->getOperand(Idx), I));
+
+    // Add operand indicating the part to generate code for to recipes still
+    // requiring it.
+    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
+            VPVectorPointerRecipe>(Copy))
+      Copy->addOperand(Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, I)));
+
+    // Vector pointer clones keep the original's first operand rather than the
+    // remapped one.
+    if (isa<VPVectorPointerRecipe>(R))
+      Copy->setOperand(0, R.getOperand(0));
+  }
+}
+
+/// Interleave all recipes in \p VPB for \p IC parts.
+///
+/// Replicate regions are cloned as a whole. Other regions are processed block
+/// by block in reverse post-order (shallow traversal from the region entry).
+/// For a basic block every recipe not listed in \p ToSkip is handled in order:
+/// recipes uniform across VFs and UFs are recorded as their own value for all
+/// parts, header phis go to interleaveHeaderPHI and all remaining recipes to
+/// interleaveRecipe.
+static void
+interleaveBlock(VPBlockBase *VPB, VPlan &Plan, unsigned IC,
+                DenseMap<VPValue *, SmallVector<VPValue *>> &InterleavedValues,
+                VPTypeAnalysis &TypeInfo,
+                SmallPtrSet<VPRecipeBase *, 8> &ToSkip,
+                SmallVector<SmallVector<VPHeaderPHIRecipe *>> &PhisToRemap) {
+  if (auto *VPR = dyn_cast<VPRegionBlock>(VPB)) {
+    if (VPR->isReplicator()) {
+      interleaveReplicateRegion(VPR, Plan, IC, InterleavedValues);
+      return;
+    }
+    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+        RPOT(VPR->getEntry());
+    for (VPBlockBase *Child : RPOT)
+      interleaveBlock(Child, Plan, IC, InterleavedValues, TypeInfo, ToSkip,
+                      PhisToRemap);
+    return;
+  }
+
+  auto *VPBB = cast<VPBasicBlock>(VPB);
+  auto InsertPtForPhi = VPBB->getFirstNonPhi();
+  // Early-increment iteration: handling a recipe may insert new recipes right
+  // after it.
+  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+    if (ToSkip.contains(&R))
+      continue;
+
+    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
+    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
+      // A uniform value is shared by all parts.
+      for (unsigned I = 1; I != IC; ++I)
+        InterleavedValues[SingleDef].push_back(SingleDef);
+      continue;
+    }
+
+    if (isa<VPHeaderPHIRecipe>(&R)) {
+      interleaveHeaderPHI(R, Plan, IC, InsertPtForPhi, InterleavedValues,
+                          TypeInfo, ToSkip, PhisToRemap);
+      continue;
+    }
+
+    interleaveRecipe(R, Plan, IC, InterleavedValues, TypeInfo);
+  }
+}
+
+void VPlanTransforms::interleave(VPlan &Plan, unsigned IC, LLVMContext &Ctx) {
+  assert(IC > 0);
+  if (IC == 1)
+    return;
+  DenseMap<VPValue *, SmallVector<VPValue *>> InterleavedValues;
+
+  SmallPtrSet<VPRecipeBase *, 8> ToSkip;
+
+  Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+  VPTypeAnalysis TypeInfo(CanIVIntTy, Ctx);
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  SmallVector<SmallVector<VPHeaderPHIRecipe *>> PhisToRemap;
+  interleaveBlock(Plan.getPreheader(), Plan, IC, InterleavedValues, TypeInfo,
+                  ToSkip, PhisToRemap);
----------------
ayalz wrote:

Does the Entry/pre-preheader need to be interleaveBlock'ed? Does it need PhisToRemap?

https://github.com/llvm/llvm-project/pull/95842


More information about the llvm-commits mailing list