[llvm] 0eeab8b - [RISCV] Add -riscv-v-fixed-length-vector-elen-max to limit the ELEN used for fixed length vectorization.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 27 10:19:22 PDT 2021


Author: Craig Topper
Date: 2021-08-27T10:17:35-07:00
New Revision: 0eeab8b2825ca9582b211fb5fbe782f702b30db7

URL: https://github.com/llvm/llvm-project/commit/0eeab8b2825ca9582b211fb5fbe782f702b30db7
DIFF: https://github.com/llvm/llvm-project/commit/0eeab8b2825ca9582b211fb5fbe782f702b30db7.diff

LOG: [RISCV] Add -riscv-v-fixed-length-vector-elen-max to limit the ELEN used for fixed length vectorization.

This adds an ELEN limit for fixed length vectors. Elements larger than
this limit are scalarized, and some fractional LMULs are disabled. For
example, if ELEN=32 then mf8 becomes illegal, i32/f32 vectors can't use
any fractional LMUL, i16/f16 can only use mf2, and i8 can use mf2 and
mf4.
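
A rough standalone sketch (not LLVM code; fractionalLMULLegal is a
made-up helper name) of the rule behind those restrictions: the smallest
fractional LMUL is 8/ELEN, and a fractional LMUL of 1/Denom only
supports SEW values from 8 up to ELEN/Denom.

// Standalone illustration only: models which fractional LMULs remain legal
// for a given element width (SEW) once ELEN is capped.
#include <cstdio>

// True if SEW-bit elements may use fractional LMUL 1/Denom when the largest
// supported element width is ELEN.
static bool fractionalLMULLegal(unsigned SEW, unsigned Denom, unsigned ELEN) {
  // The smallest fractional LMUL is 8/ELEN, i.e. the largest denominator
  // is ELEN/8.
  if (Denom > ELEN / 8)
    return false;
  // Within a fractional LMUL, SEW can range from 8 up to LMUL*ELEN.
  return SEW >= 8 && SEW <= ELEN / Denom;
}

int main() {
  const unsigned ELEN = 32; // e.g. -riscv-v-fixed-length-vector-elen-max=32
  for (unsigned SEW : {8u, 16u, 32u})
    for (unsigned Denom : {8u, 4u, 2u}) // mf8, mf4, mf2
      std::printf("SEW=%2u mf%u -> %s\n", SEW, Denom,
                  fractionalLMULLegal(SEW, Denom, ELEN) ? "legal" : "illegal");
  // With ELEN=32: mf8 is always illegal, i32/f32 get no fractional LMUL,
  // i16/f16 can only use mf2, and i8 can use mf2 and mf4.
  return 0;
}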

We may also need a similar limit for scalable vectors, but that has
interactions with the intrinsics, and we can't scalarize a scalable
vector.

Longer term, this should come from one of the Zve* features.

Added: 
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVSubtarget.cpp
    llvm/lib/Target/RISCV/RISCVSubtarget.h
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 07174327f1dae..790f795b9939a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1204,8 +1204,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
 
   unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
 
+  MVT EltVT = VT.getVectorElementType();
+
   // Don't use RVV for vectors we cannot scalarize if required.
-  switch (VT.getVectorElementType().SimpleTy) {
+  switch (EltVT.SimpleTy) {
   // i1 is supported but has different rules.
   default:
     return false;
@@ -1234,6 +1236,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
     break;
   }
 
+  // Reject elements larger than ELEN.
+  if (EltVT.getSizeInBits() > Subtarget.getMaxELENForFixedLengthVectors())
+    return false;
+
   unsigned LMul = divideCeil(VT.getSizeInBits(), MinVLen);
   // Don't use RVV for types that don't fit.
   if (LMul > Subtarget.getMaxLMULForFixedLengthVectors())
@@ -1260,6 +1266,7 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
          "Expected legal fixed length vector!");
 
   unsigned MinVLen = Subtarget.getMinRVVVectorSizeInBits();
+  unsigned MaxELen = Subtarget.getMaxELENForFixedLengthVectors();
 
   MVT EltVT = VT.getVectorElementType();
   switch (EltVT.SimpleTy) {
@@ -1274,10 +1281,12 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
   case MVT::f32:
   case MVT::f64: {
     // We prefer to use LMUL=1 for VLEN sized types. Use fractional lmuls for
-    // narrower types, but we can't have a fractional LMUL with demoninator less
-    // than 64/SEW.
+    // narrower types. The smallest fractional LMUL we support is 8/ELEN. Within
+    // each fractional LMUL we support SEW between 8 and LMUL*ELEN.
     unsigned NumElts =
-        divideCeil(VT.getVectorNumElements(), MinVLen / RISCV::RVVBitsPerBlock);
+        (VT.getVectorNumElements() * RISCV::RVVBitsPerBlock) / MinVLen;
+    NumElts = std::max(NumElts, RISCV::RVVBitsPerBlock / MaxELen);
+    assert(isPowerOf2_32(NumElts) && "Expected power of 2 NumElts");
     return MVT::getScalableVectorVT(EltVT, NumElts);
   }
   }

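The container computation added to getContainerForFixedLengthVector above can
be modeled in isolation as follows; this is a hedged sketch, with
containerNumElts as a made-up name and RVVBitsPerBlock hard-coded to LLVM's
value of 64.

// Standalone illustration only: mirrors the NumElts computation in the hunk
// above without depending on any LLVM headers.
#include <algorithm>
#include <cstdio>

static const unsigned RVVBitsPerBlock = 64;

// Number of elements in the scalable container chosen for a fixed-length
// vector of NumFixedElts elements, given the minimum VLEN and the ELEN cap.
static unsigned containerNumElts(unsigned NumFixedElts, unsigned MinVLen,
                                 unsigned MaxELen) {
  // Prefer LMUL=1 for VLEN-sized types and fractional LMULs for narrower ones.
  unsigned NumElts = (NumFixedElts * RVVBitsPerBlock) / MinVLen;
  // Clamp to the smallest legal fractional LMUL, 8/ELEN, which corresponds to
  // RVVBitsPerBlock / MaxELen container elements.
  return std::max(NumElts, RVVBitsPerBlock / MaxELen);
}

int main() {
  // <2 x i32> with -riscv-v-vector-bits-min=128:
  std::printf("ELEN=64: nxv%ui32\n", containerNumElts(2, 128, 64)); // nxv1i32 (mf2)
  std::printf("ELEN=32: nxv%ui32\n", containerNumElts(2, 128, 32)); // nxv2i32 (m1)
  return 0;
}

So a <2 x i32> keeps an mf2 container when ELEN=64 but moves up to an LMUL=1
container when ELEN=32, which is what the add_v2i32 and fadd_v2f32 tests
below check.
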
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index b19fdcb0082bd..56437b79eec6d 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -45,6 +45,11 @@ static cl::opt<unsigned> RVVVectorLMULMax(
              "Fractional LMUL values are not supported."),
     cl::init(8), cl::Hidden);
 
+static cl::opt<unsigned> RVVVectorELENMax(
+    "riscv-v-fixed-length-vector-elen-max",
+    cl::desc("The maximum ELEN value to use for fixed length vectors."),
+    cl::init(64), cl::Hidden);
+
 void RISCVSubtarget::anchor() {}
 
 RISCVSubtarget &
@@ -142,7 +147,18 @@ unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const {
          "Tried to get maximum LMUL without V extension support!");
   assert(RVVVectorLMULMax <= 8 && isPowerOf2_32(RVVVectorLMULMax) &&
          "V extension requires a LMUL to be at most 8 and a power of 2!");
-  return PowerOf2Floor(std::max<unsigned>(RVVVectorLMULMax, 1));
+  return PowerOf2Floor(
+      std::max<unsigned>(std::min<unsigned>(RVVVectorLMULMax, 8), 1));
+}
+
+unsigned RISCVSubtarget::getMaxELENForFixedLengthVectors() const {
+  assert(hasStdExtV() &&
+         "Tried to get maximum ELEN without V extension support!");
+  assert(RVVVectorELENMax <= 64 && RVVVectorELENMax >= 8 &&
+         isPowerOf2_32(RVVVectorELENMax) &&
+         "V extension requires an ELEN to be a power of 2 between 8 and 64!");
+  return PowerOf2Floor(
+      std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, 64), 8));
 }
 
 bool RISCVSubtarget::useRVVForFixedLengthVectors() const {

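The clamping in the new getMaxELENForFixedLengthVectors can be modeled the
same way; a minimal sketch, assuming release-build behavior where the assert
is compiled out (clampELEN and powerOf2Floor are made-up stand-ins, the
latter for llvm::PowerOf2Floor).

// Standalone illustration only.
#include <algorithm>
#include <cstdio>

// Largest power of two less than or equal to V (V > 0).
static unsigned powerOf2Floor(unsigned V) {
  unsigned P = 1;
  while (P * 2 <= V)
    P *= 2;
  return P;
}

static unsigned clampELEN(unsigned Requested) {
  // Mirror PowerOf2Floor(std::max(std::min(RVVVectorELENMax, 64), 8)):
  // force the value into [8, 64] and round down to a power of two.
  return powerOf2Floor(std::max(std::min(Requested, 64u), 8u));
}

int main() {
  for (unsigned V : {4u, 8u, 32u, 48u, 64u, 128u})
    std::printf("requested %3u -> ELEN %2u\n", V, clampELEN(V));
  // 4 -> 8, 48 -> 32, 128 -> 64: out-of-range or non-power-of-2 requests
  // still end up in a legal range.
  return 0;
}
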
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index ce36331e044d1..cf33ebf8cc514 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -158,6 +158,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
   unsigned getMaxRVVVectorSizeInBits() const;
   unsigned getMinRVVVectorSizeInBits() const;
   unsigned getMaxLMULForFixedLengthVectors() const;
+  unsigned getMaxELENForFixedLengthVectors() const;
   bool useRVVForFixedLengthVectors() const;
 };
 } // End llvm namespace

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 7be85cf09d5f7..1c475bd2f8279 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -104,6 +104,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
       return false;
 
+    // Don't allow elements larger than the ELEN.
+    // FIXME: How to limit for scalable vectors?
+    if (isa<FixedVectorType>(DataType) &&
+        DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+      return false;
+
     if (Alignment <
         DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
       return false;
@@ -126,6 +132,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     if (isa<FixedVectorType>(DataType) && ST->getMinRVVVectorSizeInBits() == 0)
       return false;
 
+    // Don't allow elements larger than the ELEN.
+    // FIXME: How to limit for scalable vectors?
+    if (isa<FixedVectorType>(DataType) &&
+        DataType->getScalarSizeInBits() > ST->getMaxELENForFixedLengthVectors())
+      return false;
+
     if (Alignment <
         DL.getTypeStoreSize(DataType->getScalarType()).getFixedSize())
       return false;

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
new file mode 100644
index 0000000000000..8e78e89f1fb90
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-elen-max=32 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-elen-max=32 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+; Test that limiting ELEN scalarizes elements larger than that and disables
+; some fractional LMULs.
+
+; This should use LMUL=1.
+define void @add_v4i32(<4 x i32>* %x, <4 x i32>* %y) {
+; CHECK-LABEL: add_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, <4 x i32>* %x
+  %b = load <4 x i32>, <4 x i32>* %y
+  %c = add <4 x i32> %a, %b
+  store <4 x i32> %c, <4 x i32>* %x
+  ret void
+}
+
+; i64 vectors should be scalarized
+define void @add_v2i64(<2 x i64>* %x, <2 x i64>* %y) {
+; RV32-LABEL: add_v2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 8(a0)
+; RV32-NEXT:    lw a6, 12(a0)
+; RV32-NEXT:    lw a4, 0(a0)
+; RV32-NEXT:    lw a7, 4(a0)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a5, 0(a1)
+; RV32-NEXT:    lw t0, 8(a1)
+; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    add a3, a7, a3
+; RV32-NEXT:    add a5, a4, a5
+; RV32-NEXT:    sltu a4, a5, a4
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a6, a1
+; RV32-NEXT:    add a4, a2, t0
+; RV32-NEXT:    sltu a2, a4, a2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    sw a4, 8(a0)
+; RV32-NEXT:    sw a5, 0(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    sw a3, 4(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: add_v2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a2, 8(a0)
+; RV64-NEXT:    ld a3, 0(a0)
+; RV64-NEXT:    ld a4, 0(a1)
+; RV64-NEXT:    ld a1, 8(a1)
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    sd a1, 8(a0)
+; RV64-NEXT:    sd a3, 0(a0)
+; RV64-NEXT:    ret
+  %a = load <2 x i64>, <2 x i64>* %x
+  %b = load <2 x i64>, <2 x i64>* %y
+  %c = add <2 x i64> %a, %b
+  store <2 x i64> %c, <2 x i64>* %x
+  ret void
+}
+
+; This should use LMUL=1 because there are no fractional i32 LMULs with ELEN=32
+define void @add_v2i32(<2 x i32>* %x, <2 x i32>* %y) {
+; CHECK-LABEL: add_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load <2 x i32>, <2 x i32>* %y
+  %c = add <2 x i32> %a, %b
+  store <2 x i32> %c, <2 x i32>* %x
+  ret void
+}
+
+; i64 vectors should be scalarized
+define void @add_v1i64(<1 x i64>* %x, <1 x i64>* %y) {
+; RV32-LABEL: add_v1i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a2, 0(a0)
+; RV32-NEXT:    lw a3, 4(a0)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    sw a1, 0(a0)
+; RV32-NEXT:    sw a2, 4(a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: add_v1i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a2, 0(a0)
+; RV64-NEXT:    ld a1, 0(a1)
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    sd a1, 0(a0)
+; RV64-NEXT:    ret
+  %a = load <1 x i64>, <1 x i64>* %x
+  %b = load <1 x i64>, <1 x i64>* %y
+  %c = add <1 x i64> %a, %b
+  store <1 x i64> %c, <1 x i64>* %x
+  ret void
+}
+
+; This should use LMUL=1.
+define void @fadd_v4f32(<4 x float>* %x, <4 x float>* %y) {
+; CHECK-LABEL: fadd_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x float>, <4 x float>* %x
+  %b = load <4 x float>, <4 x float>* %y
+  %c = fadd <4 x float> %a, %b
+  store <4 x float> %c, <4 x float>* %x
+  ret void
+}
+
+; double vectors should be scalarized
+define void @fadd_v2f64(<2 x double>* %x, <2 x double>* %y) {
+; CHECK-LABEL: fadd_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld ft0, 8(a0)
+; CHECK-NEXT:    fld ft1, 0(a0)
+; CHECK-NEXT:    fld ft2, 0(a1)
+; CHECK-NEXT:    fld ft3, 8(a1)
+; CHECK-NEXT:    fadd.d ft1, ft1, ft2
+; CHECK-NEXT:    fadd.d ft0, ft0, ft3
+; CHECK-NEXT:    fsd ft0, 8(a0)
+; CHECK-NEXT:    fsd ft1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x double>, <2 x double>* %x
+  %b = load <2 x double>, <2 x double>* %y
+  %c = fadd <2 x double> %a, %b
+  store <2 x double> %c, <2 x double>* %x
+  ret void
+}
+
+; This should use LMUL=1 because there are no fractional float LMULs with ELEN=32
+define void @fadd_v2f32(<2 x float>* %x, <2 x float>* %y) {
+; CHECK-LABEL: fadd_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x float>, <2 x float>* %x
+  %b = load <2 x float>, <2 x float>* %y
+  %c = fadd <2 x float> %a, %b
+  store <2 x float> %c, <2 x float>* %x
+  ret void
+}
+
+; double vectors should be scalarized
+define void @fadd_v1f64(<1 x double>* %x, <1 x double>* %y) {
+; CHECK-LABEL: fadd_v1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld ft0, 0(a0)
+; CHECK-NEXT:    fld ft1, 0(a1)
+; CHECK-NEXT:    fadd.d ft0, ft0, ft1
+; CHECK-NEXT:    fsd ft0, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <1 x double>, <1 x double>* %x
+  %b = load <1 x double>, <1 x double>* %y
+  %c = fadd <1 x double> %a, %b
+  store <1 x double> %c, <1 x double>* %x
+  ret void
+}

