[llvm] [LoopVectorize][AArch64][SVE] Generate wide active lane masks (PR #81140)

Thu Jun 6 02:55:03 PDT 2024

================
@@ -713,8 +692,94 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
     DL.print(O);
   }
 }
+
+void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+                                   VPSlotTracker &SlotTracker) const { // Writes a textual form of this recipe to O.
+  O << Indent << "EMIT "; // Output starts with the caller-supplied indent, then "EMIT ".
+
+  printAsOperand(O, SlotTracker); // Left-hand side: this recipe, printed as an operand.
+  O << " = active lane mask";
+  printFlags(O); // Flags come before the operand list.
+  printOperands(O, SlotTracker);
+
+  if (auto DL = getDebugLoc()) { // The ", !dbg " suffix is emitted only when DL tests true.
+    O << ", !dbg ";
+    DL.print(O);
+  }
+}
+
 #endif
 
+void VPActiveLaneMaskRecipe::execute(VPTransformState &State) {
+  assert(!State.Instance && "VPInstruction executing an Instance");
+
+  IRBuilderBase &Builder = State.Builder;
+  Builder.SetCurrentDebugLocation(getDebugLoc());
+
+  // If the active lane mask is scalar, generate the CMP directly
+  // to avoid unnecessary extracts.
+  if (State.VF.isScalar()) {
+    for (int Part = State.UF - 1; Part >= 0; --Part) {
+      // Get first lane of vector induction variable.
+      Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+      // Get the original loop tripcount.
+      Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
+
+      Value *V = Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0,
+                                   ScalarTC, Name);
+      State.set(this, V, Part);
+    }
+    return;
+  }
+
+  auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+  auto *PredTy = VectorType::get(Int1Ty, State.VF);
+
+  unsigned MaxPred = std::min(State.MaxPred.getKnownMinValue(),
----------------
fhahn wrote:

Conceptually, the decision whether to widen the active lane mask or not shouldn't be taken at codegen (::execute), but instead performed as a transform (or possibly on construction, if it is simple to determine). This makes both codegen and cost-modeling based on the VPlan easier, and also makes things more explicit in the representation itself.

If this depends on the concrete chosen VF/UF, it can be transformed late in the pipeline (like `optimizeForVFAndUF`).

https://github.com/llvm/llvm-project/pull/81140