[llvm] b64ddae - [RISCV] Lower experimental_get_vector_length intrinsic to vsetvli for some cases.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 5 15:02:21 PDT 2023


Author: Craig Topper
Date: 2023-06-05T15:02:11-07:00
New Revision: b64ddae8a294605819470ce2f8d8b4751d0ffe12

URL: https://github.com/llvm/llvm-project/commit/b64ddae8a294605819470ce2f8d8b4751d0ffe12
DIFF: https://github.com/llvm/llvm-project/commit/b64ddae8a294605819470ce2f8d8b4751d0ffe12.diff

LOG: [RISCV] Lower experimental_get_vector_length intrinsic to vsetvli for some cases.

This patch lowers the intrinsic to a vsetvli when the AVL is i32 or
XLenVT and the VF is a power of 2 in the range [1, 64]. VLEN=32 is not
supported because we don't have a valid type mapping for it. VF=1 is
not supported when only Zve32* is available.

The element width is used to set the SEW for the vsetvli if possible.
Otherwise we use SEW=8.
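
As an illustration only (a standalone sketch, not code from the patch), the
VF-to-vtype mapping that the lowering emits with SEW=8 can be reproduced with
the arithmetic below, assuming RVVBitsPerBlock=64:

#include <cstdio>

int main() {
  const unsigned RVVBitsPerBlock = 64; // RISCV::RVVBitsPerBlock, the vscale scaling unit
  const unsigned ElementWidth = 8;     // SEW=8 is valid for every fractional LMUL
  const unsigned LMul1VF = RVVBitsPerBlock / ElementWidth; // VF that corresponds to LMUL=1

  for (unsigned VF = 1; VF <= 64; VF *= 2) {
    bool Fractional = VF < LMul1VF;
    unsigned LMulVal = Fractional ? LMul1VF / VF : VF / LMul1VF;
    // With this vtype, VLMax = vscale * VF, which is what
    // llvm.experimental.get.vector.length asks for.
    printf("VF=%2u -> e8, %s%u\n", VF, Fractional ? "mf" : "m", LMulVal);
  }
  return 0;
}

This prints e8,mf8 for VF=1 up through e8,m8 for VF=64, matching the vtype
operands checked in the updated get_vector_length.ll test below.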

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D150824

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e8b6560036f08..acdbaccdaed87 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1175,6 +1175,33 @@ MVT RISCVTargetLowering::getVPExplicitVectorLengthTy() const {
   return Subtarget.getXLenVT();
 }
 
+// Return false if we can lower get_vector_length to a vsetvli intrinsic.
+bool RISCVTargetLowering::shouldExpandGetVectorLength(EVT TripCountVT,
+                                                      unsigned VF,
+                                                      bool IsScalable) const {
+  if (!Subtarget.hasVInstructions())
+    return true;
+
+  if (!IsScalable)
+    return true;
+
+  if (TripCountVT != MVT::i32 && TripCountVT != Subtarget.getXLenVT())
+    return true;
+
+  // Don't allow VF=1 if those types aren't legal.
+  if (VF < RISCV::RVVBitsPerBlock / Subtarget.getELEN())
+    return true;
+
+  // VLEN=32 support is incomplete.
+  if (Subtarget.getRealMinVLen() < RISCV::RVVBitsPerBlock)
+    return true;
+
+  // The maximum VF is for the smallest element width with LMUL=8.
+  // VF must be a power of 2.
+  unsigned MaxVF = (RISCV::RVVBitsPerBlock / 8) * 8;
+  return VF > MaxVF || !isPowerOf2_32(VF);
+}
+
 bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                              const CallInst &I,
                                              MachineFunction &MF,
@@ -6623,6 +6650,48 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands);
 }
 
+// Lower the llvm.get.vector.length intrinsic to vsetvli. We only support
+// scalable vector llvm.get.vector.length for now.
+//
+// We need to convert from a scalable VF to a vsetvli with VLMax equal to
+// (vscale * VF). The vscale and VF are independent of element width. We use
+// SEW=8 for the vsetvli because it is the only element width that supports all
+// fractional LMULs. The LMUL is chosen so that with SEW=8 the VLMax is
+// (vscale * VF), where vscale is defined as VLEN/RVVBitsPerBlock.
+// The InsertVSETVLI pass can fix up the vtype of the vsetvli if a
+// different SEW and LMUL are better for the surrounding vector
+// instructions.
+static SDValue lowerGetVectorLength(SDNode *N, SelectionDAG &DAG,
+                                    const RISCVSubtarget &Subtarget) {
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  // The smallest LMUL is only valid for the smallest element width.
+  const unsigned ElementWidth = 8;
+
+  // Determine the VF that corresponds to LMUL 1 for ElementWidth.
+  unsigned LMul1VF = RISCV::RVVBitsPerBlock / ElementWidth;
+  // We don't support VF==1 with ELEN==32.
+  unsigned MinVF = RISCV::RVVBitsPerBlock / Subtarget.getELEN();
+
+  unsigned VF = N->getConstantOperandVal(2);
+  assert(VF >= MinVF && VF <= (LMul1VF * 8) && isPowerOf2_32(VF) &&
+         "Unexpected VF");
+
+  bool Fractional = VF < LMul1VF;
+  unsigned LMulVal = Fractional ? LMul1VF / VF : VF / LMul1VF;
+  unsigned VLMUL = (unsigned)RISCVVType::encodeLMUL(LMulVal, Fractional);
+  unsigned VSEW = RISCVVType::encodeSEW(ElementWidth);
+
+  SDLoc DL(N);
+
+  SDValue LMul = DAG.getTargetConstant(VLMUL, DL, XLenVT);
+  SDValue Sew = DAG.getTargetConstant(VSEW, DL, XLenVT);
+
+  SDValue AVL = DAG.getNode(ISD::ZERO_EXTEND, DL, XLenVT, N->getOperand(1));
+
+  SDValue ID = DAG.getTargetConstant(Intrinsic::riscv_vsetvli, DL, XLenVT);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XLenVT, ID, AVL, Sew, LMul);
+}
+
 SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                      SelectionDAG &DAG) const {
   unsigned IntNo = Op.getConstantOperandVal(0);
@@ -6648,6 +6717,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         IntNo == Intrinsic::riscv_zip ? RISCVISD::ZIP : RISCVISD::UNZIP;
     return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1));
   }
+  case Intrinsic::experimental_get_vector_length:
+    return lowerGetVectorLength(Op.getNode(), DAG, Subtarget);
   case Intrinsic::riscv_vmv_x_s:
     assert(Op.getValueType() == XLenVT && "Unexpected VT!");
     return DAG.getNode(RISCVISD::VMV_X_S, DL, Op.getValueType(),
@@ -9471,6 +9542,11 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
     default:
       llvm_unreachable(
           "Don't know how to custom type legalize this intrinsic!");
+    case Intrinsic::experimental_get_vector_length: {
+      SDValue Res = lowerGetVectorLength(N, DAG, Subtarget);
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+      return;
+    }
     case Intrinsic::riscv_orc_b: {
       SDValue NewOp =
           DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 69d5dffa15d98..65c9cd763c6f3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -870,6 +870,9 @@ class RISCVTargetLowering : public TargetLowering {
 
   MVT getVPExplicitVectorLengthTy() const override;
 
+  bool shouldExpandGetVectorLength(EVT TripCountVT, unsigned VF,
+                                   bool IsScalable) const override;
+
   /// RVV code generation for fixed length vectors does not lower all
   /// BUILD_VECTORs. This makes BUILD_VECTOR legalisation a source of stores to
   /// merge. However, merging them creates a BUILD_VECTOR that is just as

diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
index b002cbc6cd4d5..1d42b6e3937c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i1)
 declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
@@ -21,51 +21,19 @@ define i32 @vector_length_i16(i16 zeroext %tc) {
 }
 
 define i32 @vector_length_i32(i32 zeroext %tc) {
-; RV32-LABEL: vector_length_i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a1, a1, 2
-; RV32-NEXT:    bltu a0, a1, .LBB1_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:  .LBB1_2:
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vector_length_i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a1, a1, 2
-; RV64-NEXT:    bltu a0, a1, .LBB1_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB1_2:
-; RV64-NEXT:    ret
+; CHECK-LABEL: vector_length_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    ret
   %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 true)
   ret i32 %a
 }
 
 define i32 @vector_length_XLen(iXLen zeroext %tc) {
-; RV32-LABEL: vector_length_XLen:
-; RV32:       # %bb.0:
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a1, a1, 2
-; RV32-NEXT:    bltu a0, a1, .LBB2_2
-; RV32-NEXT:  # %bb.1:
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:  .LBB2_2:
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vector_length_XLen:
-; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a1, a1, 2
-; RV64-NEXT:    bltu a0, a1, .LBB2_2
-; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:  .LBB2_2:
-; RV64-NEXT:    ret
+; CHECK-LABEL: vector_length_XLen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    ret
   %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 true)
   ret i32 %a
 }
@@ -128,3 +96,237 @@ define i32 @vector_length_XLen_fixed(iXLen zeroext %tc) {
   %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 false)
   ret i32 %a
 }
+
+define i32 @vector_length_vf1_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_vf1_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 1, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf1_XLen(iXLen zeroext %tc) {
+; CHECK-LABEL: vector_length_vf1_XLen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, mf8, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 1, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf2_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_vf2_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf2_XLen(iXLen zeroext %tc) {
+; CHECK-LABEL: vector_length_vf2_XLen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf4_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_vf4_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 4, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf4_XLen(iXLen zeroext %tc) {
+; CHECK-LABEL: vector_length_vf4_XLen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 4, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf8_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_vf8_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, m1, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf8_XLen(iXLen zeroext %tc) {
+; CHECK-LABEL: vector_length_vf8_XLen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, m1, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 8, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf16_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_vf16_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, m2, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 16, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf16_XLen(iXLen zeroext %tc) {
+; CHECK-LABEL: vector_length_vf16_XLen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, m2, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 16, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf32_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_vf32_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, m4, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 32, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf32_XLen(iXLen zeroext %tc) {
+; CHECK-LABEL: vector_length_vf32_XLen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, m4, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 32, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf64_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_vf64_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, m8, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 64, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf64_XLen(iXLen zeroext %tc) {
+; CHECK-LABEL: vector_length_vf64_XLen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e8, m8, ta, ma
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 64, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf128_i32(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_vf128_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    bltu a0, a1, .LBB20_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB20_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_vf128_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    bltu a0, a1, .LBB20_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB20_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 128, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf128_XLen(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_vf128_XLen:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    bltu a0, a1, .LBB21_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB21_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_vf128_XLen:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    bltu a0, a1, .LBB21_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB21_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 128, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf3_i32(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_vf3_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    srli a1, a1, 3
+; RV32-NEXT:    slli a2, a1, 1
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    bltu a0, a1, .LBB22_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB22_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_vf3_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    srli a1, a1, 3
+; RV64-NEXT:    slli a2, a1, 1
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    bltu a0, a1, .LBB22_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB22_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 3, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_vf3_XLen(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_vf3_XLen:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    srli a1, a1, 3
+; RV32-NEXT:    slli a2, a1, 1
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    bltu a0, a1, .LBB23_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB23_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_vf3_XLen:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    srli a1, a1, 3
+; RV64-NEXT:    slli a2, a1, 1
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    bltu a0, a1, .LBB23_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB23_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 3, i1 true)
+  ret i32 %a
+}

More information about the llvm-commits mailing list