[llvm] 748ae52 - [IR][SVE] Add new llvm.experimental.stepvector intrinsic

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 23 03:43:43 PDT 2021


Author: David Sherwood
Date: 2021-03-23T10:43:35Z
New Revision: 748ae5281d4f7f0ff261ba9e8c57e6b6fcfdb31e

URL: https://github.com/llvm/llvm-project/commit/748ae5281d4f7f0ff261ba9e8c57e6b6fcfdb31e
DIFF: https://github.com/llvm/llvm-project/commit/748ae5281d4f7f0ff261ba9e8c57e6b6fcfdb31e.diff

LOG: [IR][SVE] Add new llvm.experimental.stepvector intrinsic

This patch adds a new llvm.experimental.stepvector intrinsic,
which takes no arguments and returns a linear integer sequence of
values of the form <0, 1, ...>. It is primarily intended for
scalable vectors, although it also works for fixed-width vectors.
The intention is that later patches will make use of this new
intrinsic when vectorising induction variables, which is currently
only supported for fixed-width vectors. I've added a new
CreateStepVector method to the IRBuilder, which generates a call
to this intrinsic for scalable vectors and falls back on creating
a ConstantVector for fixed-width vectors.
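
As a minimal usage sketch of the new IRBuilder hook (assuming an
existing IRBuilder<> named Builder; the variable names here are
purely illustrative):

  // Scalable request: emits a call to llvm.experimental.stepvector.nxv4i32.
  Type *I32Ty = Builder.getInt32Ty();
  Value *ScalableStep =
      Builder.CreateStepVector(VectorType::get(I32Ty, 4, /*Scalable=*/true));

  // Fixed-width request: folds to the constant vector <i32 0, i32 1, i32 2, i32 3>.
  Value *FixedStep =
      Builder.CreateStepVector(VectorType::get(I32Ty, 4, /*Scalable=*/false));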

For scalable vectors this intrinsic is lowered to a new ISD node
called STEP_VECTOR, which takes a single constant integer argument
as the step. During lowering this argument is set to a value of 1.
The reason for this additional argument at the codegen level is
that future patches will introduce various generic DAG combines
such as

  mul step_vector(1), 2 -> step_vector(2)
  add step_vector(1), step_vector(1) -> step_vector(2)
  shl step_vector(1), 1 -> step_vector(2)
  etc.

that establish a canonical form across all targets. This hopefully
means that other targets supporting scalable vectors can benefit
from this work too.
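
At the SelectionDAG level, a minimal sketch of the new
SelectionDAG::getStepVector helper (assuming a SelectionDAG &DAG,
an SDLoc DL and an LLVMContext Ctx are in scope; the step value of
2 is purely illustrative):

  // Scalable result type: produces an ISD::STEP_VECTOR node with a
  // constant step of 2.
  EVT ScalableVT = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true);
  SDValue Step = DAG.getConstant(2, DL, MVT::i64);
  SDValue Scalable = DAG.getStepVector(DL, ScalableVT, Step);

  // Fixed-width result type: falls back to a BUILD_VECTOR of the
  // constants <0, 2, 4, 6>.
  EVT FixedVT = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/false);
  SDValue Fixed = DAG.getStepVector(DL, FixedVT, Step);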

I've added cost model tests for both fixed-width and scalable
vectors:

  llvm/test/Analysis/CostModel/AArch64/neon-stepvector.ll
  llvm/test/Analysis/CostModel/AArch64/sve-stepvector.ll

as well as codegen lowering tests for fixed-width and scalable
vectors:

  llvm/test/CodeGen/AArch64/neon-stepvector.ll
  llvm/test/CodeGen/AArch64/sve-stepvector.ll

See this thread for discussion of the intrinsic:
https://lists.llvm.org/pipermail/llvm-dev/2021-January/147943.html

Added: 
    llvm/test/Analysis/CostModel/AArch64/neon-stepvector.ll
    llvm/test/Analysis/CostModel/AArch64/sve-stepvector.ll
    llvm/test/CodeGen/AArch64/neon-stepvector.ll
    llvm/test/CodeGen/AArch64/sve-stepvector.ll
    llvm/test/Verifier/stepvector-intrinsic.ll

Modified: 
    llvm/docs/LangRef.rst
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/include/llvm/CodeGen/ISDOpcodes.h
    llvm/include/llvm/CodeGen/SelectionDAG.h
    llvm/include/llvm/IR/IRBuilder.h
    llvm/include/llvm/IR/Intrinsics.td
    llvm/include/llvm/Target/TargetSelectionDAG.td
    llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
    llvm/lib/IR/IRBuilder.cpp
    llvm/lib/IR/Verifier.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
    llvm/unittests/IR/IRBuilderTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 74d54440ee0e..e73018955196 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16673,6 +16673,36 @@ The first two operands are vectors with the same type. The third argument
 the source/result vector. The ``imm`` is a signed integer constant in the range
 ``-VL <= imm < VL``. For values outside of this range the result is poison.
 
+
+'``llvm.experimental.stepvector``' Intrinsic
+
+This is an overloaded intrinsic. You can use ``llvm.experimental.stepvector``
+to generate a vector whose lane values comprise the linear sequence
+<0, 1, 2, ...>. It is primarily intended for scalable vectors.
+
+::
+
+      declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+      declare <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+
+The '``llvm.experimental.stepvector``' intrinsics are used to create vectors
+of integers whose elements contain a linear sequence of values starting from 0
+with a step of 1.  This experimental intrinsic can only be used for vectors
+with integer elements that are at least 8 bits in size. If the sequence value
+exceeds the allowed limit for the element type then the result for that lane is
+undefined.
+
+These intrinsics work for both fixed and scalable vectors. While this intrinsic
+is marked as experimental, the recommended way to express this operation for
+fixed-width vectors is still to generate a constant vector instead.
+
+
+Arguments:
+""""""""""
+
+None.
+
+
 Matrix Intrinsics
 -----------------
 

diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9b043fe98b2d..a0fd0dadbc18 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1249,6 +1249,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                              VarMask, Alignment, CostKind, I);
     }
+    case Intrinsic::experimental_stepvector: {
+      if (isa<ScalableVectorType>(RetTy))
+        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+      // The cost of materialising a constant integer vector.
+      return TargetTransformInfo::TCC_Basic;
+    }
     case Intrinsic::experimental_vector_extract: {
       // FIXME: Handle case where a scalable vector is extracted from a scalable
       // vector

diff  --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 9d6c45f4e5c3..a6337260f0f4 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -592,6 +592,14 @@ enum NodeType {
   /// scalars should have the same type.
   SPLAT_VECTOR_PARTS,
 
+  /// STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised
+  /// of a linear sequence of unsigned values starting from 0 with a step of
+  /// IMM, where IMM must be a constant positive integer value. The operation
+  /// does not support returning fixed-width vectors or non-constant operands.
+  /// If the sequence value exceeds the limit allowed for the element type then
+  /// the values for those lanes are undefined.
+  STEP_VECTOR,
+
   /// MULHU/MULHS - Multiply high - Multiply two integers of type iN,
   /// producing an unsigned/signed value of type i[2*N], then return the top
   /// part.

diff  --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 57e976f9d265..7948bd3fbfdf 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -833,6 +833,10 @@ class SelectionDAG {
     return getNode(ISD::SPLAT_VECTOR, DL, VT, Op);
   }
 
+  /// Returns a vector of type ResVT whose elements contain the linear sequence
+  ///   <0, Step, Step * 2, Step * 3, ...>
+  SDValue getStepVector(const SDLoc &DL, EVT ResVT, SDValue Step);
+
   /// Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to
   /// the shuffle node in input but with swapped operands.
   ///

diff  --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index f2da98a98b70..45c34a387e45 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -854,6 +854,9 @@ class IRBuilderBase {
   /// will be the same type as that of \p Scaling.
   Value *CreateVScale(Constant *Scaling, const Twine &Name = "");
 
+  /// Creates a vector of type \p DstType with the linear sequence <0, 1, ...>
+  Value *CreateStepVector(Type *DstType, const Twine &Name = "");
+
   /// Create a call to intrinsic \p ID with 1 operand which is mangled on its
   /// type.
   CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,

diff  --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1fc843352f7d..801da1fa8588 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1337,6 +1337,9 @@ def int_is_constant : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty],
 def int_ptrmask: DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>, llvm_anyint_ty],
                            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
 
+def int_experimental_stepvector : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                        [], [IntrNoMem]>;
+
 //===---------------- Vector Predication Intrinsics --------------===//
 
 // Speculatable Binary operators

diff  --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 7a96ed3fd93a..99066f675eea 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -664,6 +664,8 @@ def vector_reverse : SDNode<"ISD::VECTOR_REVERSE", SDTVecReverse>;
 def vector_splice : SDNode<"ISD::VECTOR_SPLICE", SDTVecSlice, []>;
 def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>;
 def splat_vector : SDNode<"ISD::SPLAT_VECTOR", SDTypeProfile<1, 1, []>, []>;
+def step_vector : SDNode<"ISD::STEP_VECTOR", SDTypeProfile<1, 1,
+                       [SDTCisVec<0>, SDTCisInt<1>]>, []>;
 def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
                               []>;
 

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index cf970566a9d2..88b48f247698 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -110,6 +110,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
                          Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break;
   case ISD::SPLAT_VECTOR:
                          Res = PromoteIntRes_SPLAT_VECTOR(N); break;
+  case ISD::STEP_VECTOR: Res = PromoteIntRes_STEP_VECTOR(N); break;
   case ISD::CONCAT_VECTORS:
                          Res = PromoteIntRes_CONCAT_VECTORS(N); break;
 
@@ -4782,6 +4783,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SPLAT_VECTOR(SDNode *N) {
   return DAG.getNode(ISD::SPLAT_VECTOR, dl, NOutVT, Op);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_STEP_VECTOR(SDNode *N) {
+  SDLoc dl(N);
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "Type must be promoted to a vector type");
+  EVT NOutElemVT = TLI.getTypeToTransformTo(*DAG.getContext(),
+                                            NOutVT.getVectorElementType());
+  APInt StepVal = cast<ConstantSDNode>(N->getOperand(0))->getAPIntValue();
+  SDValue Step = DAG.getConstant(StepVal.getZExtValue(), dl, NOutElemVT);
+  return DAG.getStepVector(dl, NOutVT, Step);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
   SDLoc dl(N);
 

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index de916823463f..a40dd8887033 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -304,6 +304,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
   SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
   SDValue PromoteIntRes_SPLAT_VECTOR(SDNode *N);
+  SDValue PromoteIntRes_STEP_VECTOR(SDNode *N);
   SDValue PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N);
   SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N);
   SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N);
@@ -836,6 +837,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5843e7396818..190b4f32c4f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -928,6 +928,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SCALAR_TO_VECTOR:
     SplitVecRes_ScalarOp(N, Lo, Hi);
     break;
+  case ISD::STEP_VECTOR:
+    SplitVecRes_STEP_VECTOR(N, Lo, Hi);
+    break;
   case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
@@ -1639,6 +1642,30 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
     Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
 }
 
+void DAGTypeLegalizer::SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo,
+                                               SDValue &Hi) {
+  EVT LoVT, HiVT;
+  SDLoc dl(N);
+  assert(N->getValueType(0).isScalableVector() &&
+         "Only scalable vectors are supported for STEP_VECTOR");
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  SDValue Step = N->getOperand(0);
+
+  Lo = DAG.getNode(ISD::STEP_VECTOR, dl, LoVT, Step);
+
+  // Hi = Lo + (EltCnt * Step)
+  EVT EltVT = Step.getValueType();
+  SDValue StartOfHi =
+      DAG.getVScale(dl, EltVT,
+                    cast<ConstantSDNode>(Step)->getAPIntValue() *
+                        LoVT.getVectorMinNumElements());
+  StartOfHi = DAG.getZExtOrTrunc(StartOfHi, dl, HiVT.getVectorElementType());
+  StartOfHi = DAG.getNode(ISD::SPLAT_VECTOR, dl, HiVT, StartOfHi);
+
+  Hi = DAG.getNode(ISD::STEP_VECTOR, dl, HiVT, Step);
+  Hi = DAG.getNode(ISD::ADD, dl, HiVT, Hi, StartOfHi);
+}
+
 void DAGTypeLegalizer::SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo,
                                             SDValue &Hi) {
   EVT LoVT, HiVT;

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f89c5571f82b..e5be4aeb8668 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1744,6 +1744,18 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
   return SDValue(CondCodeNodes[Cond], 0);
 }
 
+SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT, SDValue Step) {
+  if (ResVT.isScalableVector())
+    return getNode(ISD::STEP_VECTOR, DL, ResVT, Step);
+
+  EVT OpVT = Step.getValueType();
+  APInt StepVal = cast<ConstantSDNode>(Step)->getAPIntValue();
+  SmallVector<SDValue, 16> OpsStepConstants;
+  for (uint64_t i = 0; i < ResVT.getVectorNumElements(); i++)
+    OpsStepConstants.push_back(getConstant(StepVal * i, DL, OpVT));
+  return getBuildVector(ResVT, DL, OpsStepConstants);
+}
+
 /// Swaps the values of N1 and N2. Swaps all indices in the shuffle mask M that
 /// point at N1 to point at N2 and indices that point at N2 to point at N1.
 static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
@@ -4339,6 +4351,14 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
   return (computeKnownBits(A).Zero | computeKnownBits(B).Zero).isAllOnesValue();
 }
 
+static SDValue FoldSTEP_VECTOR(const SDLoc &DL, EVT VT, SDValue Step,
+                               SelectionDAG &DAG) {
+  if (cast<ConstantSDNode>(Step)->isNullValue())
+    return DAG.getConstant(0, DL, VT);
+
+  return SDValue();
+}
+
 static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
                                 ArrayRef<SDValue> Ops,
                                 SelectionDAG &DAG) {
@@ -4560,6 +4580,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                         APFloat::rmNearestTiesToEven, &Ignored);
       return getConstantFP(FPV, DL, VT);
     }
+    case ISD::STEP_VECTOR: {
+      if (SDValue V = FoldSTEP_VECTOR(DL, VT, Operand, *this))
+        return V;
+      break;
+    }
     }
   }
 
@@ -4669,6 +4694,18 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
 
   unsigned OpOpcode = Operand.getNode()->getOpcode();
   switch (Opcode) {
+  case ISD::STEP_VECTOR:
+    assert(VT.isScalableVector() &&
+           "STEP_VECTOR can only be used with scalable types");
+    assert(VT.getScalarSizeInBits() >= 8 &&
+           "STEP_VECTOR can only be used with vectors of integers that are at "
+           "least 8 bits wide");
+    assert(Operand.getValueType().bitsGE(VT.getScalarType()) &&
+           "Operand type should be at least as large as the element type");
+    assert(isa<ConstantSDNode>(Operand) &&
+           cast<ConstantSDNode>(Operand)->getAPIntValue().isNonNegative() &&
+           "Expected positive integer constant for STEP_VECTOR");
+    break;
   case ISD::FREEZE:
     assert(VT == Operand.getValueType() && "Unexpected VT!");
     break;

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ffd4778a4a42..a9de6031a3b6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6945,7 +6945,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::experimental_deoptimize:
     LowerDeoptimizeCall(&I);
     return;
-
+  case Intrinsic::experimental_stepvector:
+    visitStepVector(I);
+    return;
   case Intrinsic::vector_reduce_fadd:
   case Intrinsic::vector_reduce_fmul:
   case Intrinsic::vector_reduce_add:
@@ -10929,6 +10931,16 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
   }
 }
 
+void SelectionDAGBuilder::visitStepVector(const CallInst &I) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  auto DL = getCurSDLoc();
+  EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+  EVT OpVT =
+      TLI.getTypeToTransformTo(*DAG.getContext(), ResultVT.getScalarType());
+  SDValue Step = DAG.getConstant(1, DL, OpVT);
+  setValue(&I, DAG.getStepVector(DL, ResultVT, Step));
+}
+
 void SelectionDAGBuilder::visitVectorReverse(const CallInst &I) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index a759f8babe33..156f4c00273d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -779,6 +779,7 @@ class SelectionDAGBuilder {
   void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
   void visitVectorReverse(const CallInst &I);
   void visitVectorSplice(const CallInst &I);
+  void visitStepVector(const CallInst &I);
 
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 8402163fd92b..7bba8d30eb3f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -292,6 +292,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::SPLAT_VECTOR:               return "splat_vector";
   case ISD::SPLAT_VECTOR_PARTS:         return "splat_vector_parts";
   case ISD::VECTOR_REVERSE:             return "vector_reverse";
+  case ISD::STEP_VECTOR:                return "step_vector";
   case ISD::CARRY_FALSE:                return "carry_false";
   case ISD::ADDC:                       return "addc";
   case ISD::ADDE:                       return "adde";

diff  --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index d4292b3cfc1b..53174041150c 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -91,6 +91,23 @@ Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) {
              : CreateMul(CI, Scaling);
 }
 
+Value *IRBuilderBase::CreateStepVector(Type *DstType, const Twine &Name) {
+  if (isa<ScalableVectorType>(DstType))
+    return CreateIntrinsic(Intrinsic::experimental_stepvector, {DstType}, {},
+                           nullptr, Name);
+
+  Type *STy = DstType->getScalarType();
+  unsigned NumEls = cast<FixedVectorType>(DstType)->getNumElements();
+
+  // Create a vector of consecutive numbers from zero to VF.
+  SmallVector<Constant *, 8> Indices;
+  for (unsigned i = 0; i < NumEls; ++i)
+    Indices.push_back(ConstantInt::get(STy, i));
+
+  // Add the consecutive indices to the vector value.
+  return ConstantVector::get(Indices);
+}
+
 CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
                                       MaybeAlign Align, bool isVolatile,
                                       MDNode *TBAATag, MDNode *ScopeTag,

diff  --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 77acfc447cab..60689efce625 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5195,6 +5195,15 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
 
     break;
   }
+  case Intrinsic::experimental_stepvector: {
+    VectorType *VecTy = dyn_cast<VectorType>(Call.getType());
+    Assert(VecTy && VecTy->getScalarType()->isIntegerTy() &&
+               VecTy->getScalarSizeInBits() >= 8,
+           "experimental_stepvector only supported for vectors of integers "
+           "with a bitwidth of at least 8.",
+           &Call);
+    break;
+  }
   case Intrinsic::experimental_vector_insert: {
     VectorType *VecTy = cast<VectorType>(Call.getArgOperand(0)->getType());
     VectorType *SubVecTy = cast<VectorType>(Call.getArgOperand(1)->getType());

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5ab8d8a5d6f1..6c1888b74c7b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1135,6 +1135,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+      setOperationAction(ISD::STEP_VECTOR, VT, Custom);
 
       setOperationAction(ISD::MULHU, VT, Expand);
       setOperationAction(ISD::MULHS, VT, Expand);
@@ -4402,6 +4403,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::SPLAT_VECTOR:
     return LowerSPLAT_VECTOR(Op, DAG);
+  case ISD::STEP_VECTOR:
+    return LowerSTEP_VECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::INSERT_SUBVECTOR:
@@ -9049,6 +9052,21 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   return GenerateTBL(Op, ShuffleMask, DAG);
 }
 
+SDValue AArch64TargetLowering::LowerSTEP_VECTOR(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  assert(VT.isScalableVector() &&
+         "Only expect scalable vectors for STEP_VECTOR");
+  EVT ElemVT = VT.getScalarType();
+  assert(ElemVT != MVT::i1 &&
+         "Vectors of i1 types not supported for STEP_VECTOR");
+
+  SDValue StepVal = Op.getOperand(0);
+  SDValue Zero = DAG.getConstant(0, dl, StepVal.getValueType());
+  return DAG.getNode(AArch64ISD::INDEX_VECTOR, dl, VT, Zero, StepVal);
+}
+
 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
                                                  SelectionDAG &DAG) const {
   SDLoc dl(Op);

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1264d6779924..cd9185be40e4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -936,6 +936,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp,
                               bool OverrideNEON = false) const;

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 23b6978edac1..d199213e140d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -260,6 +260,19 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
       return LT.first;
     break;
   }
+  case Intrinsic::experimental_stepvector: {
+    unsigned Cost = 1; // Cost of the `index' instruction
+    auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
+    // Legalisation of illegal vectors involves an `index' instruction plus
+    // (LT.first - 1) vector adds.
+    if (LT.first > 1) {
+      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
+      unsigned AddCost =
+          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
+      Cost += AddCost * (LT.first - 1);
+    }
+    return Cost;
+  }
   default:
     break;
   }

diff  --git a/llvm/test/Analysis/CostModel/AArch64/neon-stepvector.ll b/llvm/test/Analysis/CostModel/AArch64/neon-stepvector.ll
new file mode 100644
index 000000000000..c38e7582f9bc
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/neon-stepvector.ll
@@ -0,0 +1,34 @@
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+neon  < %s | FileCheck %s
+
+; Check icmp for legal integer vectors.
+define void @stepvector_legal_int() {
+; CHECK-LABEL: 'stepvector_legal_int'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = call <2 x i64> @llvm.experimental.stepvector.v2i64()
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = call <4 x i32> @llvm.experimental.stepvector.v4i32()
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = call <8 x i16> @llvm.experimental.stepvector.v8i16()
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = call <16 x i8> @llvm.experimental.stepvector.v16i8()
+  %1 = call <2 x i64> @llvm.experimental.stepvector.v2i64()
+  %2 = call <4 x i32> @llvm.experimental.stepvector.v4i32()
+  %3 = call <8 x i16> @llvm.experimental.stepvector.v8i16()
+  %4 = call <16 x i8> @llvm.experimental.stepvector.v16i8()
+  ret void
+}
+
+; Check icmp for an illegal integer vector.
+define void @stepvector_illegal_int() {
+; CHECK-LABEL: 'stepvector_illegal_int'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = call <4 x i64> @llvm.experimental.stepvector.v4i64()
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %2 = call <16 x i32> @llvm.experimental.stepvector.v16i32()
+  %1 = call <4 x i64> @llvm.experimental.stepvector.v4i64()
+  %2 = call <16 x i32> @llvm.experimental.stepvector.v16i32()
+  ret void
+}
+
+
+declare <2 x i64> @llvm.experimental.stepvector.v2i64()
+declare <4 x i32> @llvm.experimental.stepvector.v4i32()
+declare <8 x i16> @llvm.experimental.stepvector.v8i16()
+declare <16 x i8> @llvm.experimental.stepvector.v16i8()
+
+declare <4 x i64> @llvm.experimental.stepvector.v4i64()
+declare <16 x i32> @llvm.experimental.stepvector.v16i32()

diff  --git a/llvm/test/Analysis/CostModel/AArch64/sve-stepvector.ll b/llvm/test/Analysis/CostModel/AArch64/sve-stepvector.ll
new file mode 100644
index 000000000000..c3109248838b
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-stepvector.ll
@@ -0,0 +1,39 @@
+; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve  < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Check icmp for legal integer vectors.
+define void @stepvector_legal_int() {
+; CHECK-LABEL: 'stepvector_legal_int'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+  %1 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+  %2 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+  %3 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+  %4 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+  ret void
+}
+
+; Check icmp for an illegal integer vector.
+define void @stepvector_illegal_int() {
+; CHECK-LABEL: 'stepvector_illegal_int'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %2 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+  %1 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  %2 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+  ret void
+}
+
+
+declare <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+declare <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+declare <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+
+declare <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+declare <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()

diff  --git a/llvm/test/CodeGen/AArch64/neon-stepvector.ll b/llvm/test/CodeGen/AArch64/neon-stepvector.ll
new file mode 100644
index 000000000000..05308bf5f6d7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-stepvector.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK
+
+; LEGAL INTEGER TYPES
+
+define <2 x i64> @stepvector_v2i64() {
+; CHECK-LABEL: .LCPI0_0:
+; CHECK-NEXT:    .xword 0
+; CHECK-NEXT:    .xword 1
+; CHECK-LABEL: stepvector_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <2 x i64> @llvm.experimental.stepvector.v2i64()
+  ret <2 x i64> %0
+}
+
+define <4 x i32> @stepvector_v4i32() {
+; CHECK-LABEL: .LCPI1_0:
+; CHECK-NEXT:    .word 0
+; CHECK-NEXT:    .word 1
+; CHECK-NEXT:    .word 2
+; CHECK-NEXT:    .word 3
+; CHECK-LABEL: stepvector_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <4 x i32> @llvm.experimental.stepvector.v4i32()
+  ret <4 x i32> %0
+}
+
+define <8 x i16> @stepvector_v8i16() {
+; CHECK-LABEL: .LCPI2_0:
+; CHECK-NEXT:    .hword 0
+; CHECK-NEXT:    .hword 1
+; CHECK-NEXT:    .hword 2
+; CHECK-NEXT:    .hword 3
+; CHECK-NEXT:    .hword 4
+; CHECK-NEXT:    .hword 5
+; CHECK-NEXT:    .hword 6
+; CHECK-NEXT:    .hword 7
+; CHECK-LABEL: stepvector_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <8 x i16> @llvm.experimental.stepvector.v8i16()
+  ret <8 x i16> %0
+}
+
+define <16 x i8> @stepvector_v16i8() {
+; CHECK-LABEL: .LCPI3_0:
+; CHECK-NEXT:    .byte 0
+; CHECK-NEXT:    .byte 1
+; CHECK-NEXT:    .byte 2
+; CHECK-NEXT:    .byte 3
+; CHECK-NEXT:    .byte 4
+; CHECK-NEXT:    .byte 5
+; CHECK-NEXT:    .byte 6
+; CHECK-NEXT:    .byte 7
+; CHECK-NEXT:    .byte 8
+; CHECK-NEXT:    .byte 9
+; CHECK-NEXT:    .byte 10
+; CHECK-NEXT:    .byte 11
+; CHECK-NEXT:    .byte 12
+; CHECK-NEXT:    .byte 13
+; CHECK-NEXT:    .byte 14
+; CHECK-NEXT:    .byte 15
+; CHECK-LABEL: stepvector_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <16 x i8> @llvm.experimental.stepvector.v16i8()
+  ret <16 x i8> %0
+}
+
+; ILLEGAL INTEGER TYPES
+
+define <4 x i64> @stepvector_v4i64() {
+; CHECK-LABEL: .LCPI4_0:
+; CHECK-NEXT:    .xword 0
+; CHECK-NEXT:    .xword 1
+; CHECK-LABEL: .LCPI4_1:
+; CHECK-NEXT:    .xword 2
+; CHECK-NEXT:    .xword 3
+; CHECK-LABEL: stepvector_v4i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    adrp x9, .LCPI4_1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI4_1]
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <4 x i64> @llvm.experimental.stepvector.v4i64()
+  ret <4 x i64> %0
+}
+
+define <16 x i32> @stepvector_v16i32() {
+; CHECK-LABEL: .LCPI5_0:
+; CHECK-NEXT:    .word 0
+; CHECK-NEXT:    .word 1
+; CHECK-NEXT:    .word 2
+; CHECK-NEXT:    .word 3
+; CHECK-LABEL: .LCPI5_1:
+; CHECK-NEXT:    .word 4
+; CHECK-NEXT:    .word 5
+; CHECK-NEXT:    .word 6
+; CHECK-NEXT:    .word 7
+; CHECK-LABEL: .LCPI5_2:
+; CHECK-NEXT:    .word 8
+; CHECK-NEXT:    .word 9
+; CHECK-NEXT:    .word 10
+; CHECK-NEXT:    .word 11
+; CHECK-LABEL: .LCPI5_3:
+; CHECK-NEXT:    .word 12
+; CHECK-NEXT:    .word 13
+; CHECK-NEXT:    .word 14
+; CHECK-NEXT:    .word 15
+; CHECK-LABEL: stepvector_v16i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI5_0
+; CHECK-NEXT:    adrp x9, .LCPI5_1
+; CHECK-NEXT:    adrp x10, .LCPI5_2
+; CHECK-NEXT:    adrp x11, .LCPI5_3
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI5_1]
+; CHECK-NEXT:    ldr q2, [x10, :lo12:.LCPI5_2]
+; CHECK-NEXT:    ldr q3, [x11, :lo12:.LCPI5_3]
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <16 x i32> @llvm.experimental.stepvector.v16i32()
+  ret <16 x i32> %0
+}
+
+define <2 x i32> @stepvector_v2i32() {
+; CHECK-LABEL: .LCPI6_0:
+; CHECK-NEXT:    .word 0
+; CHECK-NEXT:    .word 1
+; CHECK-LABEL: stepvector_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI6_0
+; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI6_0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <2 x i32> @llvm.experimental.stepvector.v2i32()
+  ret <2 x i32> %0
+}
+
+define <4 x i16> @stepvector_v4i16() {
+; CHECK-LABEL: .LCPI7_0:
+; CHECK-NEXT:    .hword 0
+; CHECK-NEXT:    .hword 1
+; CHECK-NEXT:    .hword 2
+; CHECK-NEXT:    .hword 3
+; CHECK-LABEL: stepvector_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI7_0
+; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI7_0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <4 x i16> @llvm.experimental.stepvector.v4i16()
+  ret <4 x i16> %0
+}
+
+
+declare <2 x i64> @llvm.experimental.stepvector.v2i64()
+declare <4 x i32> @llvm.experimental.stepvector.v4i32()
+declare <8 x i16> @llvm.experimental.stepvector.v8i16()
+declare <16 x i8> @llvm.experimental.stepvector.v16i8()
+
+declare <4 x i64> @llvm.experimental.stepvector.v4i64()
+declare <16 x i32> @llvm.experimental.stepvector.v16i32()
+declare <2 x i32> @llvm.experimental.stepvector.v2i32()
+declare <4 x i16> @llvm.experimental.stepvector.v4i16()

diff  --git a/llvm/test/CodeGen/AArch64/sve-stepvector.ll b/llvm/test/CodeGen/AArch64/sve-stepvector.ll
new file mode 100644
index 000000000000..1b162ec577c2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-stepvector.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s --check-prefixes=CHECK
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s < %t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; LEGAL INTEGER TYPES
+
+define <vscale x 2 x i64> @stepvector_nxv2i64() {
+; CHECK-LABEL: stepvector_nxv2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @stepvector_nxv4i32() {
+; CHECK-LABEL: stepvector_nxv4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+  ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 8 x i16> @stepvector_nxv8i16() {
+; CHECK-LABEL: stepvector_nxv8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.h, #0, #1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+  ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 16 x i8> @stepvector_nxv16i8() {
+; CHECK-LABEL: stepvector_nxv16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.b, #0, #1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+  ret <vscale x 16 x i8> %0
+}
+
+; ILLEGAL INTEGER TYPES
+
+define <vscale x 4 x i64> @stepvector_nxv4i64() {
+; CHECK-LABEL: stepvector_nxv4i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    add z1.d, z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  ret <vscale x 4 x i64> %0
+}
+
+define <vscale x 16 x i32> @stepvector_nxv16i32() {
+; CHECK-LABEL: stepvector_nxv16i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cntw x9
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z1.s, w9
+; CHECK-NEXT:    mov z3.s, w8
+; CHECK-NEXT:    add z1.s, z0.s, z1.s
+; CHECK-NEXT:    add z2.s, z0.s, z3.s
+; CHECK-NEXT:    add z3.s, z1.s, z3.s
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+  ret <vscale x 16 x i32> %0
+}
+
+define <vscale x 2 x i32> @stepvector_nxv2i32() {
+; CHECK-LABEL: stepvector_nxv2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
+  ret <vscale x 2 x i32> %0
+}
+
+define <vscale x 4 x i16> @stepvector_nxv4i16() {
+; CHECK-LABEL: stepvector_nxv4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
+  ret <vscale x 4 x i16> %0
+}
+
+define <vscale x 8 x i8> @stepvector_nxv8i8() {
+; CHECK-LABEL: stepvector_nxv8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    index z0.h, #0, #1
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
+  ret <vscale x 8 x i8> %0
+}
+
+declare <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+declare <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+declare <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+
+declare <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+declare <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+declare <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
+declare <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
+declare <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()

diff  --git a/llvm/test/Verifier/stepvector-intrinsic.ll b/llvm/test/Verifier/stepvector-intrinsic.ll
new file mode 100644
index 000000000000..f6852dd75e7c
--- /dev/null
+++ b/llvm/test/Verifier/stepvector-intrinsic.ll
@@ -0,0 +1,29 @@
+; RUN: not opt -S -verify < %s 2>&1 | FileCheck %s
+
+; Reject stepvector intrinsics that return a scalar
+
+define i32 @stepvector_i32() {
+; CHECK: Intrinsic has incorrect return type!
+  %1 = call i32 @llvm.experimental.stepvector.i32()
+  ret i32 %1
+}
+
+; Reject vectors with non-integer elements
+
+define <vscale x 4 x float> @stepvector_float() {
+; CHECK: experimental_stepvector only supported for vectors of integers with a bitwidth of at least 8
+  %1 = call <vscale x 4 x float> @llvm.experimental.stepvector.nxv4f32()
+  ret <vscale x 4 x float> %1
+}
+
+; Reject vectors of integers less than 8 bits in width
+
+define <vscale x 16 x i1> @stepvector_i1() {
+; CHECK: experimental_stepvector only supported for vectors of integers with a bitwidth of at least 8
+  %1 = call <vscale x 16 x i1> @llvm.experimental.stepvector.nxv16i1()
+  ret <vscale x 16 x i1> %1
+}
+
+declare i32 @llvm.experimental.stepvector.i32()
+declare <vscale x 4 x float> @llvm.experimental.stepvector.nxv4f32()
+declare <vscale x 16 x i1> @llvm.experimental.stepvector.nxv16i1()

diff  --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index e9354dafd17b..f7543d8ded8a 100644
--- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -648,4 +648,18 @@ TEST_F(AArch64SelectionDAGTest, getTypeConversion_NoScalarizeEVT_nxv1f128) {
   EXPECT_DEATH(getTypeAction(FromVT), "Cannot legalize this vector");
 }
 
+TEST_F(AArch64SelectionDAGTest, TestFold_STEP_VECTOR) {
+  if (!TM)
+    return;
+
+  SDLoc Loc;
+  auto IntVT = EVT::getIntegerVT(Context, 8);
+  auto VecVT = EVT::getVectorVT(Context, MVT::i8, 16, true);
+
+  // Should create SPLAT_VECTOR
+  SDValue Zero = DAG->getConstant(0, Loc, IntVT);
+  SDValue Op = DAG->getNode(ISD::STEP_VECTOR, Loc, VecVT, Zero);
+  EXPECT_EQ(Op.getOpcode(), ISD::SPLAT_VECTOR);
+}
+
 } // end namespace llvm

diff  --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 2e5b050ad5c0..ef3895d1aba9 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -180,6 +180,32 @@ TEST_F(IRBuilderTest, IntrinsicsWithScalableVectors) {
     EXPECT_EQ(FTy->getParamType(i), ArgTys[i]->getType());
 }
 
+TEST_F(IRBuilderTest, CreateStepVector) {
+  IRBuilder<> Builder(BB);
+
+  // Fixed width vectors
+  Type *DstVecTy = VectorType::get(Builder.getInt32Ty(), 4, false);
+  Value *StepVec = Builder.CreateStepVector(DstVecTy);
+  EXPECT_TRUE(isa<Constant>(StepVec));
+  EXPECT_EQ(StepVec->getType(), DstVecTy);
+
+  const auto *VectorValue = cast<Constant>(StepVec);
+  for (unsigned i = 0; i < 4; i++) {
+    EXPECT_TRUE(isa<ConstantInt>(VectorValue->getAggregateElement(i)));
+    ConstantInt *El = cast<ConstantInt>(VectorValue->getAggregateElement(i));
+    EXPECT_EQ(El->getValue(), i);
+  }
+
+  // Scalable vectors
+  DstVecTy = VectorType::get(Builder.getInt32Ty(), 4, true);
+  StepVec = Builder.CreateStepVector(DstVecTy);
+  EXPECT_TRUE(isa<CallInst>(StepVec));
+  CallInst *Call = cast<CallInst>(StepVec);
+  FunctionType *FTy = Call->getFunctionType();
+  EXPECT_EQ(FTy->getReturnType(), DstVecTy);
+  EXPECT_EQ(Call->getIntrinsicID(), Intrinsic::experimental_stepvector);
+}
+
 TEST_F(IRBuilderTest, ConstrainedFP) {
   IRBuilder<> Builder(BB);
   Value *V;


        

