[libcxx-commits] [flang] [lld] [libc] [clang-tools-extra] [libclc] [clang] [libcxx] [libcxxabi] [llvm] [lldb] [compiler-rt] [VPlan] Add new VPUniformPerUFRecipe, use for step truncation. (PR #78113)

Thu Jan 25 01:48:55 PST 2024

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/78113

>From 36b085f21b76d7bf7c9965a86a09d1cef4fe9329 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 14 Jan 2024 14:13:08 +0000
Subject: [PATCH 1/5] [VPlan] Add new VPUniformPerUFRecipe, use for step
 truncation.

Add a new recipe to model uniform-per-UF instructions, without relying
on an underlying instruction. Initially, it supports uniform cast-ops
and is therefore storing the result type.

Not relying on an underlying instruction (like the current
VPReplicateRecipe) allows to create instances without a corresponding
instruction.

In the future, to plan is to extend this recipe to handle all opcodes
needed to replace the uniform part of VPReplicateRecipe.
---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 30 ++++++++++++
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  6 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 49 ++++++++++++++++---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  9 ++++
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |  1 +
 .../LoopVectorize/cast-induction.ll           |  4 +-
 .../interleave-and-scalarize-only.ll          |  3 +-
 .../pr46525-expander-insertpoint.ll           |  2 +-
 8 files changed, 93 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4b4f4911eb6415..d5985224cccc48 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1945,6 +1945,36 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPValue {
   }
 };
 
+/// VPUniformPerUFRecipe represents an instruction with Opcode that is uniform
+/// per UF, i.e. it generates a single scalar instance per UF.
+/// TODO: at the moment, only Cast opcodes are supported, extend to support
+///       missing opcodes to replace uniform part of VPReplicateRecipe.
+class VPUniformPerUFRecipe : public VPRecipeBase, public VPValue {
+  unsigned Opcode;
+
+  /// Result type for the cast.
+  Type *ResultTy;
+
+  Value *generate(VPTransformState &State, unsigned Part);
+
+public:
+  VPUniformPerUFRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
+      : VPRecipeBase(VPDef::VPUniformPerUFSC, {Op}), VPValue(this),
+        Opcode(Opcode), ResultTy(ResultTy) {}
+
+  ~VPUniformPerUFRecipe() override = default;
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for generating conditional branches on the bits of a mask.
 class VPBranchOnMaskRecipe : public VPRecipeBase {
 public:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 97a8a1803bbf5a..d71b0703994450 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -230,7 +230,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
             return V->getUnderlyingValue()->getType();
           })
           .Case<VPWidenCastRecipe>(
-              [](const VPWidenCastRecipe *R) { return R->getResultType(); });
+              [](const VPWidenCastRecipe *R) { return R->getResultType(); })
+          .Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
+            return R->getSCEV()->getType();
+          });
+
   assert(ResultTy && "could not infer type for the given VPValue");
   CachedTypes[V] = ResultTy;
   return ResultTy;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1f844bce23102e..423504e8f7e05e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -164,6 +164,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     auto *R = cast<VPReplicateRecipe>(this);
     return R->getUnderlyingInstr()->mayHaveSideEffects();
   }
+  case VPUniformPerUFSC:
+    return false;
   default:
     return true;
   }
@@ -1117,13 +1119,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
 
   // Ensure step has the same type as that of scalar IV.
   Type *BaseIVTy = BaseIV->getType()->getScalarType();
-  if (BaseIVTy != Step->getType()) {
-    // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
-    // avoid separate truncate here.
-    assert(Step->getType()->isIntegerTy() &&
-           "Truncation requires an integer step");
-    Step = State.Builder.CreateTrunc(Step, BaseIVTy);
-  }
+  assert(BaseIVTy == Step->getType());
 
   // We build scalar steps for both integer and floating-point induction
   // variables. Here, we determine the kind of arithmetic we will perform.
@@ -1469,6 +1465,45 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+Value *VPUniformPerUFRecipe ::generate(VPTransformState &State, unsigned Part) {
+  switch (Opcode) {
+  case Instruction::SExt:
+  case Instruction::ZExt:
+  case Instruction::Trunc: {
+    Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
+    return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
+  }
+  default:
+    llvm_unreachable("opcode not implemented yet");
+  }
+}
+
+void VPUniformPerUFRecipe ::execute(VPTransformState &State) {
+  bool UniformAcrossUFs = all_of(operands(), [](VPValue *Op) {
+    return Op->isDefinedOutsideVectorRegions();
+  });
+  for (unsigned Part = 0; Part != State.UF; ++Part) {
+    Value *Res;
+    // Only generate a single instance, if the recipe is uniform across all UFs.
+    if (Part > 0 && UniformAcrossUFs)
+      Res = State.get(this, VPIteration(0, 0));
+    else
+      Res = generate(State, Part);
+    State.set(this, Res, VPIteration(Part, 0));
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPUniformPerUFRecipe ::print(raw_ostream &O, const Twine &Indent,
+                                  VPSlotTracker &SlotTracker) const {
+  O << Indent << "UNIFORM-PER-UF ";
+  printAsOperand(O, SlotTracker);
+  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
+  printOperands(O, SlotTracker);
+  O << " to " << *ResultTy;
+}
+#endif
+
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Branch on Mask works only on single instance.");
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b3694e74a38509..6ba8901e76aa50 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -505,6 +505,15 @@ static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
     HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
   }
 
+  VPTypeAnalysis TypeInfo(SE.getContext());
+  if (TypeInfo.inferScalarType(BaseIV) != TypeInfo.inferScalarType(Step)) {
+    Step = new VPUniformPerUFRecipe(Instruction::Trunc, Step,
+                                    TypeInfo.inferScalarType(BaseIV));
+    auto *VecPreheader =
+        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
+    VecPreheader->appendRecipe(Step->getDefiningRecipe());
+  }
+
   VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
   HeaderVPBB->insert(Steps, IP);
   return Steps;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 8cc98f4abf933e..009edea39a3c43 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -362,6 +362,7 @@ class VPDef {
     // START: Phi-like recipes. Need to be kept together.
     VPBlendSC,
     VPPredInstPHISC,
+    VPUniformPerUFSC,
     // START: SubclassID for recipes that inherit VPHeaderPHIRecipe.
     // VPHeaderPHIRecipe need to be kept together.
     VPCanonicalIVPHISC,
diff --git a/llvm/test/Transforms/LoopVectorize/cast-induction.ll b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
index c5edf9831d7d90..4121a1399c47f5 100644
--- a/llvm/test/Transforms/LoopVectorize/cast-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
@@ -83,12 +83,14 @@ define void @cast_variable_step(i64 %step) {
 ; VF4: middle.block:
 ;
 ; IC2-LABEL: @cast_variable_step(
+; IC2:   [[TRUNC_STEP:%.+]] = trunc i64 %step to i32
+; IC2:   br label %vector.body
+
 ; IC2-LABEL: vector.body:
 ; IC2-NEXT:   [[INDEX:%.+]] = phi i64 [ 0, %vector.ph ]
 ; IC2-NEXT:   [[MUL:%.+]] = mul i64 %index, %step
 ; IC2-NEXT:   [[OFFSET_IDX:%.+]] = add i64 10, [[MUL]]
 ; IC2-NEXT:   [[TRUNC_OFF:%.+]] = trunc i64 [[OFFSET_IDX]] to i32
-; IC2-NEXT:   [[TRUNC_STEP:%.+]] = trunc i64 %step to i32
 ; IC2-NEXT:   [[STEP0:%.+]] = mul i32 0, [[TRUNC_STEP]]
 ; IC2-NEXT:   [[T0:%.+]] = add i32 [[TRUNC_OFF]], [[STEP0]]
 ; IC2-NEXT:   [[STEP1:%.+]] = mul i32 1, [[TRUNC_STEP]]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index 297cd2a7c12f9a..6410a556589f94 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -184,6 +184,7 @@ exit:
 ; DBG-NEXT: No successors
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
+; DBG-NEXT:   UNIFORM-PER-UF vp<[[CAST:%.+]]> = trunc ir<1> to i32
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {
@@ -191,7 +192,7 @@ exit:
 ; DBG-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; DBG-NEXT:     FIRST-ORDER-RECURRENCE-PHI ir<%for> = phi ir<0>, vp<[[SCALAR_STEPS:.+]]>
 ; DBG-NEXT:     vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * ir<1> (truncated to i32)
-; DBG-NEXT:     vp<[[SCALAR_STEPS]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<1>
+; DBG-NEXT:     vp<[[SCALAR_STEPS]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, vp<[[CAST]]>
 ; DBG-NEXT:     EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%for>, vp<[[SCALAR_STEPS]]>
 ; DBG-NEXT:     CLONE store vp<[[SPLICE]]>, ir<%dst>
 ; DBG-NEXT:     EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
index ea3de4a0fbb363..f0220f5e766b23 100644
--- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
@@ -43,7 +43,7 @@ define void @test(i16 %x, i64 %y, ptr %ptr) {
 ; CHECK-NEXT:    [[V3:%.*]] = add i8 [[V2]], 1
 ; CHECK-NEXT:    [[CMP15:%.*]] = icmp slt i8 [[V3]], 5
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[INC]]
-; CHECK-NEXT:    br i1 [[CMP15]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP15]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       loop.exit:
 ; CHECK-NEXT:    [[DIV_1:%.*]] = udiv i64 [[Y]], [[ADD]]
 ; CHECK-NEXT:    [[V1:%.*]] = add i64 [[DIV_1]], 1

>From 6b3e52eebb0bc89e802c6d83afc2b2f79e5123a9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 21 Jan 2024 21:05:36 +0000
Subject: [PATCH 2/5] !fixup specialize to VPScalarCastRecipe for now.

---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 59 +++++++++----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 +++--
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  4 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |  2 +-
 .../interleave-and-scalarize-only.ll          |  2 +-
 5 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ebdc4678853894..59eccf135dbe43 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -859,6 +859,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenIntOrFpInductionSC:
     case VPRecipeBase::VPWidenPointerInductionSC:
     case VPRecipeBase::VPReductionPHISC:
+    case VPRecipeBase::VPScalarCastSC:
       return true;
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPBranchOnMaskSC:
@@ -1338,6 +1339,34 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
   Type *getResultType() const { return ResultTy; }
 };
 
+/// VPScalarCastRecipe is a recipe o create scalar cast instructions.
+class VPScalarCastRecipe : public VPRecipeBase, public VPValue {
+  /// Cast instruction opcode.
+  Instruction::CastOps Opcode;
+
+  /// Result type for the cast.
+  Type *ResultTy;
+
+  Value *generate(VPTransformState &State, unsigned Part);
+
+public:
+  VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
+      : VPRecipeBase(VPDef::VPScalarCastSC, {Op}), VPValue(this),
+        Opcode(Opcode), ResultTy(ResultTy) {}
+
+  ~VPScalarCastRecipe() override = default;
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for widening Call instructions.
 class VPWidenCallRecipe : public VPSingleDefRecipe {
   /// ID of the vector intrinsic to call when widening the call. If set the
@@ -2010,36 +2039,6 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags {
   }
 };
 
-/// VPUniformPerUFRecipe represents an instruction with Opcode that is uniform
-/// per UF, i.e. it generates a single scalar instance per UF.
-/// TODO: at the moment, only Cast opcodes are supported, extend to support
-///       missing opcodes to replace uniform part of VPReplicateRecipe.
-class VPUniformPerUFRecipe : public VPRecipeBase, public VPValue {
-  unsigned Opcode;
-
-  /// Result type for the cast.
-  Type *ResultTy;
-
-  Value *generate(VPTransformState &State, unsigned Part);
-
-public:
-  VPUniformPerUFRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
-      : VPRecipeBase(VPDef::VPUniformPerUFSC, {Op}), VPValue(this),
-        Opcode(Opcode), ResultTy(ResultTy) {}
-
-  ~VPUniformPerUFRecipe() override = default;
-
-  VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
-
-  void execute(VPTransformState &State) override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  /// Print the recipe.
-  void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override;
-#endif
-};
-
 /// A recipe for generating conditional branches on the bits of a mask.
 class VPBranchOnMaskRecipe : public VPRecipeBase {
 public:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f6dd13c6375cfd..fe93bae09f0d42 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -164,7 +164,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     auto *R = cast<VPReplicateRecipe>(this);
     return R->getUnderlyingInstr()->mayHaveSideEffects();
   }
-  case VPUniformPerUFSC:
+  case VPScalarCastSC:
     return false;
   default:
     return true;
@@ -1465,7 +1465,9 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-Value *VPUniformPerUFRecipe ::generate(VPTransformState &State, unsigned Part) {
+Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
+  assert(vputils::onlyFirstLaneUsed(this) &&
+         "Codegen only implemented for first lane only.");
   switch (Opcode) {
   case Instruction::SExt:
   case Instruction::ZExt:
@@ -1478,7 +1480,7 @@ Value *VPUniformPerUFRecipe ::generate(VPTransformState &State, unsigned Part) {
   }
 }
 
-void VPUniformPerUFRecipe ::execute(VPTransformState &State) {
+void VPScalarCastRecipe ::execute(VPTransformState &State) {
   bool UniformAcrossUFs = all_of(operands(), [](VPValue *Op) {
     return Op->isDefinedOutsideVectorRegions();
   });
@@ -1494,9 +1496,9 @@ void VPUniformPerUFRecipe ::execute(VPTransformState &State) {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPUniformPerUFRecipe ::print(raw_ostream &O, const Twine &Indent,
-                                  VPSlotTracker &SlotTracker) const {
-  O << Indent << "UNIFORM-PER-UF ";
+void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
+                                VPSlotTracker &SlotTracker) const {
+  O << Indent << "SCALAR-CAST ";
   printAsOperand(O, SlotTracker);
   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
   printOperands(O, SlotTracker);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c73058a24eb155..c485eadf9e0f36 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -506,8 +506,8 @@ static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
 
   VPTypeAnalysis TypeInfo(SE.getContext());
   if (TypeInfo.inferScalarType(BaseIV) != TypeInfo.inferScalarType(Step)) {
-    Step = new VPUniformPerUFRecipe(Instruction::Trunc, Step,
-                                    TypeInfo.inferScalarType(BaseIV));
+    Step = new VPScalarCastRecipe(Instruction::Trunc, Step,
+                                  TypeInfo.inferScalarType(BaseIV));
     auto *VecPreheader =
         cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
     VecPreheader->appendRecipe(Step->getDefiningRecipe());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 009edea39a3c43..bbbf2d3a965dbf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -362,7 +362,7 @@ class VPDef {
     // START: Phi-like recipes. Need to be kept together.
     VPBlendSC,
     VPPredInstPHISC,
-    VPUniformPerUFSC,
+    VPScalarCastSC,
     // START: SubclassID for recipes that inherit VPHeaderPHIRecipe.
     // VPHeaderPHIRecipe need to be kept together.
     VPCanonicalIVPHISC,
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index 6410a556589f94..c3bf2eee1dfc88 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -184,7 +184,7 @@ exit:
 ; DBG-NEXT: No successors
 ; DBG-EMPTY:
 ; DBG-NEXT: vector.ph:
-; DBG-NEXT:   UNIFORM-PER-UF vp<[[CAST:%.+]]> = trunc ir<1> to i32
+; DBG-NEXT:   SCALAR-CAST vp<[[CAST:%.+]]> = trunc ir<1> to i32
 ; DBG-NEXT: Successor(s): vector loop
 ; DBG-EMPTY:
 ; DBG-NEXT: <x1> vector loop: {

>From 9331a454be3ca943244ddd02c934192eda98ec39 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 22 Jan 2024 12:49:52 +0000
Subject: [PATCH 3/5] !fixup address latest comments, thanks!

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |  1 +
 llvm/lib/Transforms/Vectorize/VPlan.h           | 10 ++++------
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  |  7 +++----
 llvm/lib/Transforms/Vectorize/VPlanValue.h      |  2 +-
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ca93e15719fb2..7d1708b36a8786 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2297,6 +2297,7 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                            ? B.CreateSExtOrTrunc(Index, StepTy)
                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
   if (CastedIndex != Index) {
+    assert(!isa<SExtInst>(CastedIndex));
     CastedIndex->setName(CastedIndex->getName() + ".cast");
     Index = CastedIndex;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 59eccf135dbe43..4350b6f81e5580 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1339,9 +1339,8 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
   Type *getResultType() const { return ResultTy; }
 };
 
-/// VPScalarCastRecipe is a recipe o create scalar cast instructions.
-class VPScalarCastRecipe : public VPRecipeBase, public VPValue {
-  /// Cast instruction opcode.
+/// VPScalarCastRecipe is a recipe to create scalar cast instructions.
+class VPScalarCastRecipe : public VPSingleDefRecipe {
   Instruction::CastOps Opcode;
 
   /// Result type for the cast.
@@ -1351,17 +1350,16 @@ class VPScalarCastRecipe : public VPRecipeBase, public VPValue {
 
 public:
   VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
-      : VPRecipeBase(VPDef::VPScalarCastSC, {Op}), VPValue(this),
+      : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}),
         Opcode(Opcode), ResultTy(ResultTy) {}
 
   ~VPScalarCastRecipe() override = default;
 
-  VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
+  VP_CLASSOF_IMPL(VPDef::VPScalarCastSC)
 
   void execute(VPTransformState &State) override;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fe93bae09f0d42..36c8f8e77e9353 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -117,6 +117,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   switch (getVPDefID()) {
   case VPDerivedIVSC:
   case VPPredInstPHISC:
+  case VPScalarCastSC:
     return false;
   case VPInstructionSC:
     switch (cast<VPInstruction>(this)->getOpcode()) {
@@ -164,8 +165,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     auto *R = cast<VPReplicateRecipe>(this);
     return R->getUnderlyingInstr()->mayHaveSideEffects();
   }
-  case VPScalarCastSC:
-    return false;
   default:
     return true;
   }
@@ -1119,7 +1118,7 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
 
   // Ensure step has the same type as that of scalar IV.
   Type *BaseIVTy = BaseIV->getType()->getScalarType();
-  assert(BaseIVTy == Step->getType());
+  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
 
   // We build scalar steps for both integer and floating-point induction
   // variables. Here, we determine the kind of arithmetic we will perform.
@@ -1467,7 +1466,7 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
 
 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
   assert(vputils::onlyFirstLaneUsed(this) &&
-         "Codegen only implemented for first lane only.");
+         "Codegen only implemented for first lane.");
   switch (Opcode) {
   case Instruction::SExt:
   case Instruction::ZExt:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index bbbf2d3a965dbf..c85f7715feaa2a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -350,6 +350,7 @@ class VPDef {
     VPInterleaveSC,
     VPReductionSC,
     VPReplicateSC,
+    VPScalarCastSC,
     VPScalarIVStepsSC,
     VPVectorPointerSC,
     VPWidenCallSC,
@@ -362,7 +363,6 @@ class VPDef {
     // START: Phi-like recipes. Need to be kept together.
     VPBlendSC,
     VPPredInstPHISC,
-    VPScalarCastSC,
     // START: SubclassID for recipes that inherit VPHeaderPHIRecipe.
     // VPHeaderPHIRecipe need to be kept together.
     VPCanonicalIVPHISC,

>From 9988d78ee278fc6664c2f3c6073cddf88ca50755 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 22 Jan 2024 12:55:35 +0000
Subject: [PATCH 4/5] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4350b6f81e5580..6192dc09fe231d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1350,8 +1350,8 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {
 
 public:
   VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
-      : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}),
-        Opcode(Opcode), ResultTy(ResultTy) {}
+      : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}), Opcode(Opcode),
+        ResultTy(ResultTy) {}
 
   ~VPScalarCastRecipe() override = default;
 

>From 5500bdbe8c3576ad2b9f3f17166c4457d94bcb74 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 22 Jan 2024 21:51:21 +0000
Subject: [PATCH 5/5] !fixup address comments, use to truncate
 VPDerivedIVRecipe.

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  7 -----
 llvm/lib/Transforms/Vectorize/VPlan.h         | 16 ++++------
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  2 ++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 18 +++++++-----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 29 ++++++++++++++-----
 5 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7d1708b36a8786..cf8e98cbc38a86 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2297,7 +2297,6 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                            ? B.CreateSExtOrTrunc(Index, StepTy)
                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
   if (CastedIndex != Index) {
-    assert(!isa<SExtInst>(CastedIndex));
     CastedIndex->setName(CastedIndex->getName() + ".cast");
     Index = CastedIndex;
   }
@@ -9285,12 +9284,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
       Kind, cast_if_present<BinaryOperator>(FPBinOp));
   DerivedIV->setName("offset.idx");
-  if (TruncResultTy) {
-    assert(TruncResultTy != DerivedIV->getType() &&
-           Step->getType()->isIntegerTy() &&
-           "Truncation requires an integer step");
-    DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
-  }
   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
 
   State.set(this, DerivedIV, VPIteration(0, 0));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6192dc09fe231d..e6cbd81f062782 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1343,7 +1343,6 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
 class VPScalarCastRecipe : public VPSingleDefRecipe {
   Instruction::CastOps Opcode;
 
-  /// Result type for the cast.
   Type *ResultTy;
 
   Value *generate(VPTransformState &State, unsigned Part);
@@ -1363,6 +1362,9 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
+
+  /// Returns the result type of the cast.
+  Type *getResultType() const { return ResultTy; }
 };
 
 /// A recipe for widening Call instructions.
@@ -2347,10 +2349,6 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
 /// an IV with different start and step values, using Start + CanonicalIV *
 /// Step.
 class VPDerivedIVRecipe : public VPSingleDefRecipe {
-  /// If not nullptr, the result of the induction will get truncated to
-  /// TruncResultTy.
-  Type *TruncResultTy;
-
   /// Kind of the induction.
   const InductionDescriptor::InductionKind Kind;
   /// If not nullptr, the floating point induction binary operator. Must be set
@@ -2359,10 +2357,9 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
 
 public:
   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
-                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
-                    Type *TruncResultTy)
+                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
       : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
-        TruncResultTy(TruncResultTy), Kind(IndDesc.getKind()),
+        Kind(IndDesc.getKind()),
         FPBinOp(dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())) {
   }
 
@@ -2381,8 +2378,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
 #endif
 
   Type *getScalarType() const {
-    return TruncResultTy ? TruncResultTy
-                         : getStartValue()->getLiveInIRValue()->getType();
+    return getStartValue()->getLiveInIRValue()->getType();
   }
 
   VPValue *getStartValue() const { return getOperand(0); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d71b0703994450..515dc41a55ea1b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -231,6 +231,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           })
           .Case<VPWidenCastRecipe>(
               [](const VPWidenCastRecipe *R) { return R->getResultType(); })
+          .Case<VPScalarCastRecipe>(
+              [](const VPScalarCastRecipe *R) { return R->getResultType(); })
           .Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
             return R->getSCEV()->getType();
           });
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 36c8f8e77e9353..afad4f068dd80f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1097,9 +1097,6 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
   getCanonicalIV()->printAsOperand(O, SlotTracker);
   O << " * ";
   getStepValue()->printAsOperand(O, SlotTracker);
-
-  if (TruncResultTy)
-    O << " (truncated to " << *TruncResultTy << ")";
 }
 #endif
 
@@ -1464,6 +1461,12 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
+  return C->isDefinedOutsideVectorRegions() ||
+         isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
+         isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
+}
+
 Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
   assert(vputils::onlyFirstLaneUsed(this) &&
          "Codegen only implemented for first lane.");
@@ -1480,13 +1483,12 @@ Value *VPScalarCastRecipe ::generate(VPTransformState &State, unsigned Part) {
 }
 
 void VPScalarCastRecipe ::execute(VPTransformState &State) {
-  bool UniformAcrossUFs = all_of(operands(), [](VPValue *Op) {
-    return Op->isDefinedOutsideVectorRegions();
-  });
+  bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
   for (unsigned Part = 0; Part != State.UF; ++Part) {
     Value *Res;
-    // Only generate a single instance, if the recipe is uniform across all UFs.
-    if (Part > 0 && UniformAcrossUFs)
+    // Only generate a single instance, if the recipe is uniform across UFs and
+    // VFs.
+    if (Part > 0 && IsUniformAcrossVFsAndUFs)
       Res = State.get(this, VPIteration(0, 0));
     else
       Res = generate(State, Part);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c485eadf9e0f36..49fc6431bb2b44 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -498,16 +498,31 @@ static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
   VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
   Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
   VPValue *BaseIV = CanonicalIV;
+  VPTypeAnalysis TypeInfo(SE.getContext());
+  Type *StepTy = TypeInfo.inferScalarType(Step);
   if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
-    BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
-                                   TruncI ? TruncI->getType() : nullptr);
-    HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
+    // If the induction needs transforming besides truncating, create a
+    // VPDerivedIVRecipe.
+    if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, IVTy)) {
+      BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step);
+      HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
+    }
+    if (TypeInfo.inferScalarType(BaseIV) != TruncTy) {
+      assert(TypeInfo.inferScalarType(BaseIV)->getScalarSizeInBits() >
+                 TruncTy->getScalarSizeInBits() &&
+             StepTy->isIntegerTy() && "Truncation requires an integer step");
+      auto *T = new VPScalarCastRecipe(Instruction::Trunc, BaseIV, TruncTy);
+      HeaderVPBB->insert(T, IP);
+      BaseIV = T;
+    }
   }
 
-  VPTypeAnalysis TypeInfo(SE.getContext());
-  if (TypeInfo.inferScalarType(BaseIV) != TypeInfo.inferScalarType(Step)) {
-    Step = new VPScalarCastRecipe(Instruction::Trunc, Step,
-                                  TypeInfo.inferScalarType(BaseIV));
+  Type *BaseIVTy = TypeInfo.inferScalarType(BaseIV);
+  if (BaseIVTy != StepTy) {
+    assert(StepTy->getScalarSizeInBits() > BaseIVTy->getScalarSizeInBits() &&
+           "Not truncating.");
+
+    Step = new VPScalarCastRecipe(Instruction::Trunc, Step, BaseIVTy);
     auto *VecPreheader =
         cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
     VecPreheader->appendRecipe(Step->getDefiningRecipe());