[llvm] c5e6c88 - [VP][SelectionDAG][RISCV] Add get_vector_length intrinsics and generic SelectionDAG support.

Fri May 26 09:06:45 PDT 2023

Author: Craig Topper
Date: 2023-05-26T09:06:38-07:00
New Revision: c5e6c886aabb36ab66b4ed835da243c2a3455ade

URL: https://github.com/llvm/llvm-project/commit/c5e6c886aabb36ab66b4ed835da243c2a3455ade
DIFF: https://github.com/llvm/llvm-project/commit/c5e6c886aabb36ab66b4ed835da243c2a3455ade.diff

LOG: [VP][SelectionDAG][RISCV] Add get_vector_length intrinsics and generic SelectionDAG support.

The generic implementation is umin(TC, VF * vscale).

Lowering to vsetvli for RISC-V will come in a future patch.

This patch is a pre-requisite to be able to CodeGen vectorized code from
D99750.

Reviewed By: reames, frasercrmck

Differential Revision: https://reviews.llvm.org/D149916

Added: 
    llvm/test/CodeGen/AArch64/get_vector_length.ll
    llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
    llvm/test/Verifier/get_vector_length.ll

Modified: 
    llvm/docs/LangRef.rst
    llvm/include/llvm/CodeGen/SelectionDAG.h
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/include/llvm/IR/Intrinsics.td
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/IR/Verifier.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 83f0e350a82da..98f5c8c8a53c1 100644

--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18069,6 +18069,54 @@ Arguments:
 None.
 
 
+'``llvm.experimental.get.vector.length``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.experimental.get.vector.length.i32(i32 %cnt, i32 immarg %vf, i1 immarg %scalable)
+      declare i32 @llvm.experimental.get.vector.length.i64(i64 %cnt, i32 immarg %vf, i1 immarg %scalable)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.get.vector.length.*``' intrinsics take a number of
+elements to process and returns how many of the elements can be processed
+with the requested vectorization factor.
+
+Arguments:
+""""""""""
+
+The first argument is an unsigned value of any scalar integer type and specifies
+the total number of elements to be processed. The second argument is an i32
+immediate for the vectorization factor. The third argument indicates if the
+vectorization factor should be multiplied by vscale.
+
+Semantics:
+""""""""""
+
+Returns a positive i32 value (explicit vector length) that is unknown at compile
+time and depends on the hardware specification.
+If the result value does not fit in the result type, then the result is
+a :ref:`poison value <poisonvalues>`.
+
+This intrinsic is intended to be used by loop vectorization with VP intrinsics
+in order to get the number of elements to process on each loop iteration. The
+result should be used to decrease the count for the next iteration until the
+count reaches zero.
+
+If the count is larger than the number of lanes in the type described by the
+last 2 arguments, this intrinsic may return a value less than the number of
+lanes implied by the type. The result will be at least as large as the result
+will be on any later loop iteration.
+
+This intrinsic will only return 0 if the input count is also 0. A non-zero input
+count will produce a non-zero result.
+
 Matrix Intrinsics
 -----------------
 

diff  --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 90208e65de0d2..fb69fdd80b106 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1066,6 +1066,9 @@ class SelectionDAG {
   SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm,
                     bool ConstantFold = true);
 
+  SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
+                          bool ConstantFold = true);
+
   /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
   SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
     return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);

diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index dded1e59e9452..b2d73b286b0ad 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -459,6 +459,11 @@ class TargetLoweringBase {
     return true;
   }
 
+  virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF,
+                                           bool IsScalable) const {
+    return true;
+  }
+
   // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
   // vecreduce(op(x, y)) for the reduction opcode RedOpc.
   virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {

diff  --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 9b49df277dd92..86f81124b8464 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2149,6 +2149,12 @@ def int_get_active_lane_mask:
             [llvm_anyint_ty, LLVMMatchType<1>],
             [IntrNoMem, IntrNoSync, IntrWillReturn]>;
 
+def int_experimental_get_vector_length:
+  DefaultAttrsIntrinsic<[llvm_i32_ty],
+                        [llvm_anyint_ty, llvm_i32_ty, llvm_i1_ty],
+                        [IntrNoMem, IntrNoSync, IntrWillReturn,
+                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+
 def int_experimental_vp_splice:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>,

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index ffa9998949b40..473497af56748 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1956,6 +1956,15 @@ SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm,
   return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT));
 }
 
+SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
+                                      bool ConstantFold) {
+  if (EC.isScalable())
+    return getVScale(DL, VT,
+                     APInt(VT.getSizeInBits(), EC.getKnownMinValue()));
+
+  return getConstant(EC.getKnownMinValue(), DL, VT);
+}
+
 SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
   APInt One(ResVT.getScalarSizeInBits(), 1);
   return getStepVector(DL, ResVT, One);

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e5e8175602865..1621ae4858229 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7319,6 +7319,40 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, SetCC);
     return;
   }
+  case Intrinsic::experimental_get_vector_length: {
+    assert(cast<ConstantInt>(I.getOperand(1))->getSExtValue() > 0 &&
+           "Expected positive VF");
+    unsigned VF = cast<ConstantInt>(I.getOperand(1))->getZExtValue();
+    bool IsScalable = cast<ConstantInt>(I.getOperand(2))->isOne();
+
+    SDValue Count = getValue(I.getOperand(0));
+    EVT CountVT = Count.getValueType();
+
+    if (!TLI.shouldExpandGetVectorLength(CountVT, VF, IsScalable)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    // Expand to a umin between the trip count and the maximum elements the type
+    // can hold.
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+    // Extend the trip count to at least the result VT.
+    if (CountVT.bitsLT(VT)) {
+      Count = DAG.getNode(ISD::ZERO_EXTEND, sdl, VT, Count);
+      CountVT = VT;
+    }
+
+    SDValue MaxEVL = DAG.getElementCount(sdl, CountVT,
+                                         ElementCount::get(VF, IsScalable));
+
+    SDValue UMin = DAG.getNode(ISD::UMIN, sdl, CountVT, Count, MaxEVL);
+    // Clip to the result type if needed.
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, sdl, VT, UMin);
+
+    setValue(&I, Trunc);
+    return;
+  }
   case Intrinsic::vector_insert: {
     SDValue Vec = getValue(I.getOperand(0));
     SDValue SubVec = getValue(I.getOperand(1));

diff  --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 24333dd7c94c3..4ec940f2a67ee 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5469,6 +5469,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           Call);
     break;
   }
+  case Intrinsic::experimental_get_vector_length: {
+    ConstantInt *VF = cast<ConstantInt>(Call.getArgOperand(1));
+    Check(!VF->isNegative() && !VF->isZero(),
+          "get_vector_length: VF must be positive", Call);
+    break;
+  }
   case Intrinsic::masked_load: {
     Check(Call.getType()->isVectorTy(), "masked_load: must return a vector",
           Call);

diff  --git a/llvm/test/CodeGen/AArch64/get_vector_length.ll b/llvm/test/CodeGen/AArch64/get_vector_length.ll
new file mode 100644
index 0000000000000..3a83edb339b84
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/get_vector_length.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i1)
+
+define i32 @vector_length_i16(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i64(i64 %tc) {
+; CHECK-LABEL: vector_length_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i64(i64 %tc, i32 2, i1 true)
+  ret i32 %a
+}

diff  --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
new file mode 100644
index 0000000000000..b002cbc6cd4d5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i1)
+
+define i32 @vector_length_i16(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    bltu a0, a1, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB1_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB1_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB1_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB1_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_XLen(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_XLen:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB2_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB2_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_XLen:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB2_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB2_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i16_fixed(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16_fixed:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 2
+; CHECK-NEXT:    bltu a0, a1, .LBB3_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a0, 2
+; CHECK-NEXT:  .LBB3_2:
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 false)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32_fixed(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_i32_fixed:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB4_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a0, 2
+; RV32-NEXT:  .LBB4_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_i32_fixed:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    li a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB4_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a0, 2
+; RV64-NEXT:  .LBB4_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 false)
+  ret i32 %a
+}
+
+define i32 @vector_length_XLen_fixed(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_XLen_fixed:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB5_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a0, 2
+; RV32-NEXT:  .LBB5_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_XLen_fixed:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    li a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB5_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a0, 2
+; RV64-NEXT:  .LBB5_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 false)
+  ret i32 %a
+}

diff  --git a/llvm/test/Verifier/get_vector_length.ll b/llvm/test/Verifier/get_vector_length.ll
new file mode 100644
index 0000000000000..2fb2e089cd69f
--- /dev/null
+++ b/llvm/test/Verifier/get_vector_length.ll
@@ -0,0 +1,17 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
+
+define i32 @vector_length_negative_vf(i32 zeroext %tc) {
+  ; CHECK: get_vector_length: VF must be positive
+  ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 -1, i1 true)
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 -1, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_zero_vf(i32 zeroext %tc) {
+  ; CHECK: get_vector_length: VF must be positive
+  ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 0, i1 true)
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 0, i1 true)
+  ret i32 %a
+}