[llvm] 0035dec - [CodeGen] Fix issues with scalable-vector INSERT/EXTRACT_SUBVECTORs

Fraser Cormack via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 15 10:11:28 PDT 2021


Author: Fraser Cormack
Date: 2021-03-15T17:04:21Z
New Revision: 0035decae7ab9ab1c988fdcede46598540afd1a0

URL: https://github.com/llvm/llvm-project/commit/0035decae7ab9ab1c988fdcede46598540afd1a0
DIFF: https://github.com/llvm/llvm-project/commit/0035decae7ab9ab1c988fdcede46598540afd1a0.diff

LOG: [CodeGen] Fix issues with scalable-vector INSERT/EXTRACT_SUBVECTORs

This patch addresses a few issues when dealing with scalable-vector
INSERT_SUBVECTOR and EXTRACT_SUBVECTOR nodes.

When legalizing in DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR, we
spill the vector to the stack and reload the low and high halves
separately. The offset at which the high half is reloaded was
calculated incorrectly: for a scalable low half it is a multiple of
vscale, not a fixed byte count.
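
To make the offset problem concrete, here is a simplified sketch
(ours, not the patch) of a vscale-aware pointer bump along the lines
of the IncrementPointer helper the fix switches to; variable names
follow the surrounding hunk:

    unsigned IncrementSize = LoVT.getSizeInBits().getKnownMinSize() / 8;
    if (LoVT.isScalableVector()) {
      // The high half lives at vscale * IncrementSize bytes, a runtime
      // quantity, so an explicit multiply by vscale is emitted.
      SDValue BytesIncrement = DAG.getVScale(
          dl, StackPtr.getValueType(),
          APInt(StackPtr.getValueSizeInBits().getFixedSize(),
                IncrementSize));
      StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(),
                             StackPtr, BytesIncrement);
    } else {
      // Fixed-length case: a compile-time byte offset is correct, which
      // is all the old code handled.
      StackPtr = DAG.getMemBasePlusOffset(
          StackPtr, TypeSize::Fixed(IncrementSize), dl);
    }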

Additionally, we can optimize this process when we can detect that the
subvector is contained entirely within the low or high half of the
split vector type. While this optimization is valid on scalable
vectors, the 'high' optimization also requires the subvector's
scalability to match the vector's: a fixed-length subvector cannot be
proven to lie wholly within the high half of a scalable vector. Note
that the 'low' optimization is still conservative: it may be possible
to insert v2i32 into the low half of a vector split into
nxv1i32/nxv1i32, but we can't guarantee it. It is always possible to
insert v2i32 into nxv2i32, or v2i32 at index 2 into nxv4i32, as we
know vscale is at least 1.
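
As a concrete illustration (ours, not from the patch): split nxv4i32
into nxv2i32/nxv2i32 halves and insert v2i32 at index 2. Judged on
minimum element counts the subvector looks fully contained in the high
half (2 >= 2 and 2 + 2 <= 4), but at vscale = 2 the low half really
holds 4 elements and index 2 still lands inside it. A scalable
subvector doesn't have this problem: both halves and the subvector
scale together, so the containment check stays valid.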

Lastly, in SelectionDAG::isSplatValue, we early-exit when the
extracted subvector's value type is a scalable vector, forgetting that
we can also extract a fixed-length vector from a scalable one. In that
case the source vector is still scalable, so we must bail out before
querying its fixed element count.
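
Schematically, the problematic shape is a fixed-length extract from a
scalable source (our example, not from the patch):

    t1: nxv4i32 = splat_vector Constant:i32<7>
    t2: v4i32 = extract_subvector t1, Constant:i64<0>

Here t2's type is fixed-length, so the existing early-exit doesn't
fire, yet the code below the new guard calls getVectorNumElements() on
the nxv4i32 source, which is invalid for scalable types. Returning
false is conservative but safe.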

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D98495

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
    llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 92d9daa99c9f..5843e7396818 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1273,20 +1273,24 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
 
   EVT VecVT = Vec.getValueType();
   EVT LoVT = Lo.getValueType();
-  unsigned VecElems = VecVT.getVectorNumElements();
-  unsigned SubElems = SubVec.getValueType().getVectorNumElements();
-  unsigned LoElems = LoVT.getVectorNumElements();
+  EVT SubVecVT = SubVec.getValueType();
+  unsigned VecElems = VecVT.getVectorMinNumElements();
+  unsigned SubElems = SubVecVT.getVectorMinNumElements();
+  unsigned LoElems = LoVT.getVectorMinNumElements();
 
   // If we know the index is in the first half, and we know the subvector
   // doesn't cross the boundary between the halves, we can avoid spilling the
   // vector, and insert into the lower half of the split vector directly.
-  // Similarly if the subvector is fully in the high half.
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   if (IdxVal + SubElems <= LoElems) {
     Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
     return;
   }
-  if (IdxVal >= LoElems && IdxVal + SubElems <= VecElems) {
+  // Similarly if the subvector is fully in the high half, but mind that we
+  // can't tell whether a fixed-length subvector is fully within the high half
+  // of a scalable vector.
+  if (VecVT.isScalableVector() == SubVecVT.isScalableVector() &&
+      IdxVal >= LoElems && IdxVal + SubElems <= VecElems) {
     Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, Hi.getValueType(), Hi, SubVec,
                      DAG.getVectorIdxConstant(IdxVal - LoElems, dl));
     return;
@@ -1315,13 +1319,12 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
                    SmallestAlign);
 
   // Increment the pointer to the other part.
-  unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
-  StackPtr =
-      DAG.getMemBasePlusOffset(StackPtr, TypeSize::Fixed(IncrementSize), dl);
+  auto *Load = cast<LoadSDNode>(Lo);
+  MachinePointerInfo MPI = Load->getPointerInfo();
+  IncrementPointer(Load, LoVT, MPI, StackPtr);
 
   // Load the Hi part from the stack slot.
-  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
-                   PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
+  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MPI, SmallestAlign);
 }
 
 void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2ad3ada7fc04..da891e1c2425 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2524,6 +2524,9 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
   case ISD::EXTRACT_SUBVECTOR: {
     // Offset the demanded elts by the subvector index.
     SDValue Src = V.getOperand(0);
+    // We don't support scalable vectors at the moment.
+    if (Src.getValueType().isScalableVector())
+      return false;
     uint64_t Idx = V.getConstantOperandVal(1);
     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
     APInt UndefSrcElts;

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 00ab5a70b946..194fe3400e6f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -553,6 +553,99 @@ define <vscale x 8 x i1> @insert_nxv8i1_v8i1_16(<vscale x 8 x i1> %v, <8 x i1>*
   ret <vscale x 8 x i1> %c
 }
 
+declare <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
+
+define void @insert_v2i64_nxv16i64(<2 x i64>* %psv0, <2 x i64>* %psv1, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_v2i64_nxv16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a3, 2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    vsetivli a0, 6, e64,m8,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v16, 4
+; CHECK-NEXT:    vs8r.v v8, (a2)
+; CHECK-NEXT:    ret
+  %sv0 = load <2 x i64>, <2 x i64>* %psv0
+  %sv1 = load <2 x i64>, <2 x i64>* %psv1
+  %v0 = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+define void @insert_v2i64_nxv16i64_lo0(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v8, (a1)
+; CHECK-NEXT:    ret
+  %sv = load <2 x i64>, <2 x i64>* %psv
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+define void @insert_v2i64_nxv16i64_lo2(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_v2i64_nxv16i64_lo2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vsetivli a0, 4, e64,m8,ta,mu
+; CHECK-NEXT:    vslideup.vi v16, v8, 2
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    ret
+  %sv = load <2 x i64>, <2 x i64>* %psv
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+; Check we don't mistakenly optimize this: we don't know whether this is
+; inserted into the low or high split vector.
+define void @insert_v2i64_nxv16i64_hi(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_v2i64_nxv16i64_hi:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    vsetivli a2, 2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a2, a0, 4
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    addi a3, zero, 8
+; CHECK-NEXT:    bltu a2, a3, .LBB29_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    addi a2, zero, 8
+; CHECK-NEXT:  .LBB29_2:
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    vsetivli a4, 2, e64,m1,ta,mu
+; CHECK-NEXT:    vse64.v v25, (a2)
+; CHECK-NEXT:    slli a0, a0, 6
+; CHECK-NEXT:    add a2, a3, a0
+; CHECK-NEXT:    vl8re64.v v8, (a2)
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8re64.v v16, (a2)
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %sv = load <2 x i64>, <2 x i64>* %psv
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 8)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
 declare <8 x i1> @llvm.experimental.vector.insert.v4i1.v8i1(<8 x i1>, <4 x i1>, i64)
 declare <32 x i1> @llvm.experimental.vector.insert.v8i1.v32i1(<32 x i1>, <8 x i1>, i64)
 

diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index 206fcc7923bd..e1e46fb6610d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -458,6 +458,46 @@ define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_2(<vscale x 4 x i1> %v, <vscale x
   ret <vscale x 4 x i1> %vec
 }
 
+declare <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
+
+define void @insert_nxv8i64_nxv16i64(<vscale x 8 x i64> %sv0, <vscale x 8 x i64> %sv1, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_nxv8i64_nxv16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vs8r.v v16, (a0)
+; CHECK-NEXT:    ret
+  %v0 = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> %v0, <vscale x 8 x i64> %sv1, i64 8)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+define void @insert_nxv8i64_nxv16i64_lo(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_nxv8i64_nxv16i64_lo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+define void @insert_nxv8i64_nxv16i64_hi(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_nxv8i64_nxv16i64_hi:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 8)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
 declare <vscale x 4 x i1> @llvm.experimental.vector.insert.nxv1i1.nxv4i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
 declare <vscale x 32 x i1> @llvm.experimental.vector.insert.nxv8i1.nxv32i1(<vscale x 32 x i1>, <vscale x 8 x i1>, i64)
 


More information about the llvm-commits mailing list