[llvm] 455ed56 - [SVE][CodeGen] Legalisation of INSERT_VECTOR_ELT for scalable vectors

Kerry McLaughlin via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 11 04:58:10 PDT 2020


Author: Kerry McLaughlin
Date: 2020-08-11T12:57:28+01:00
New Revision: 455ed56d48e365f7d095254109abed876dc10c65

URL: https://github.com/llvm/llvm-project/commit/455ed56d48e365f7d095254109abed876dc10c65
DIFF: https://github.com/llvm/llvm-project/commit/455ed56d48e365f7d095254109abed876dc10c65.diff

LOG: [SVE][CodeGen] Legalisation of INSERT_VECTOR_ELT for scalable vectors

When the result type of insertelement needs to be split,
SplitVecRes_INSERT_VECTOR_ELT will try to store the vector to a
stack temporary, store the element at the stack temporary's address
plus the offset for the index, and reload the Lo/Hi parts.

This patch does the following to ensure this works for scalable vectors:
 - Sets the StackID with getStackIDForScalableVectors() in CreateStackTemporary
 - Scales the offset by VScale in getMemBasePlusOffset() when the TypeSize
   offset is scalable
 - Ensures the immediate is clamped correctly by clampDynamicVectorIndex
   so that we don't try to use an out-of-range index
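
For illustration, one of the cases this enables (a minimal IR sketch taken
from the sve-split-insert-elt.ll test added below): a <vscale x 32 x i8>
insertelement must be split into two legal <vscale x 16 x i8> halves, and
with a variable index it is now legalised via the stack temporary described
above:

    define <vscale x 32 x i8> @split_insert_32i8_idx(<vscale x 32 x i8> %a, i8 %elt, i64 %idx) {
      %ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 %idx
      ret <vscale x 32 x i8> %ins
    }

The clamp produced by clampDynamicVectorIndex corresponds to the
rdvl/sub/cmp/csel sequence in the expected output, i.e. the index is
limited to umin(idx, vscale * 32 - 1) so that the scaled store offset
stays within the stack object.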

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D84874

Added: 
    llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 55a612bbb01c..4485fb044f34 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1456,14 +1456,16 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
 
   if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
     unsigned IdxVal = CIdx->getZExtValue();
-    unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
-    if (IdxVal < LoNumElts)
+    unsigned LoNumElts = Lo.getValueType().getVectorMinNumElements();
+    if (IdxVal < LoNumElts) {
       Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                        Lo.getValueType(), Lo, Elt, Idx);
-    else
+      return;
+    } else if (!Vec.getValueType().isScalableVector()) {
       Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
                        DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl));
-    return;
+      return;
+    }
   }
 
   // See if the target wants to custom expand this node.
@@ -1476,7 +1478,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   if (VecVT.getScalarSizeInBits() < 8) {
     EltVT = MVT::i8;
     VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
-                             VecVT.getVectorNumElements());
+                             VecVT.getVectorElementCount());
     Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
     // Extend the element type to match if needed.
     if (EltVT.bitsGT(Elt.getValueType()))
@@ -1501,7 +1503,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
   Store = DAG.getTruncStore(
       Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
-      commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
+      commonAlignment(SmallestAlign,
+                      EltVT.getSizeInBits().getFixedSize() / 8));
 
   EVT LoVT, HiVT;
   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@@ -1510,13 +1513,11 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
 
   // Increment the pointer to the other part.
-  unsigned IncrementSize = LoVT.getSizeInBits() / 8;
-  StackPtr =
-      DAG.getMemBasePlusOffset(StackPtr, TypeSize::Fixed(IncrementSize), dl);
+  auto Load = cast<LoadSDNode>(Lo);
+  MachinePointerInfo MPI = Load->getPointerInfo();
+  IncrementPointer(Load, LoVT, MPI, StackPtr);
 
-  // Load the Hi part from the stack slot.
-  Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
-                   PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
+  Hi = DAG.getLoad(HiVT, dl, Store, StackPtr, MPI, SmallestAlign);
 
   // If we adjusted the original type, we need to truncate the results.
   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0095f6299ac6..26c75ab2bb33 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2025,7 +2025,12 @@ Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
 
 SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
   MachineFrameInfo &MFI = MF->getFrameInfo();
-  int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false);
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+  int StackID = 0;
+  if (Bytes.isScalable())
+    StackID = TFI->getStackIDForScalableVectors();
+  int FrameIdx = MFI.CreateStackObject(Bytes, Alignment,
+                                       false, nullptr, StackID);
   return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
 }
 
@@ -5937,8 +5942,16 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset,
                                            const SDLoc &DL,
                                            const SDNodeFlags Flags) {
   EVT VT = Base.getValueType();
-  return getMemBasePlusOffset(Base, getConstant(Offset.getFixedSize(), DL, VT),
-                              DL, Flags);
+  SDValue Index;
+
+  if (Offset.isScalable())
+    Index = getVScale(DL, Base.getValueType(),
+                      APInt(Base.getValueSizeInBits().getFixedSize(),
+                            Offset.getKnownMinSize()));
+  else
+    Index = getConstant(Offset.getFixedSize(), DL, VT);
+
+  return getMemBasePlusOffset(Base, Index, DL, Flags);
 }
 
 SDValue SelectionDAG::getMemBasePlusOffset(SDValue Ptr, SDValue Offset,

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 911a23c966d6..b289a964606d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7197,16 +7197,26 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
                                        SDValue Idx,
                                        EVT VecVT,
                                        const SDLoc &dl) {
-  if (isa<ConstantSDNode>(Idx))
+  if (!VecVT.isScalableVector() && isa<ConstantSDNode>(Idx))
     return Idx;
 
   EVT IdxVT = Idx.getValueType();
-  unsigned NElts = VecVT.getVectorNumElements();
-  if (isPowerOf2_32(NElts)) {
-    APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(),
-                                     Log2_32(NElts));
-    return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
-                       DAG.getConstant(Imm, dl, IdxVT));
+  unsigned NElts = VecVT.getVectorMinNumElements();
+  if (VecVT.isScalableVector()) {
+    SDValue VS = DAG.getVScale(dl, IdxVT,
+                               APInt(IdxVT.getSizeInBits().getFixedSize(),
+                                     NElts));
+    SDValue Sub = DAG.getNode(ISD::SUB, dl, IdxVT, VS,
+                              DAG.getConstant(1, dl, IdxVT));
+
+    return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub);
+  } else {
+    if (isPowerOf2_32(NElts)) {
+      APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(),
+                                       Log2_32(NElts));
+      return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
+                         DAG.getConstant(Imm, dl, IdxVT));
+    }
   }
 
   return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx,
@@ -7223,8 +7233,8 @@ SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
   EVT EltVT = VecVT.getVectorElementType();
 
   // Calculate the element offset and add it to the pointer.
-  unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
-  assert(EltSize * 8 == EltVT.getSizeInBits() &&
+  unsigned EltSize = EltVT.getSizeInBits().getFixedSize() / 8; // FIXME: should be ABI size.
+  assert(EltSize * 8 == EltVT.getSizeInBits().getFixedSize() &&
          "Converting bits to bytes lost precision");
 
   Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);

diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
new file mode 100644
index 000000000000..5e6dedf4a4cc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; INSERT VECTOR ELT
+
+define <vscale x 8 x i8> @promote_insert_8i8(<vscale x 8 x i8> %a, i8 %elt, i64 %idx) {
+; CHECK-LABEL: promote_insert_8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.h, w1
+; CHECK-NEXT:    index z2.h, #0, #1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z1.h
+; CHECK-NEXT:    mov z0.h, p0/m, w0
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x i8> %a, i8 %elt, i64 %idx
+  ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 32 x i8> @split_insert_32i8_idx(<vscale x 32 x i8> %a, i8 %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_32i8_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    rdvl x8, #2
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    cmp x1, x8
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    csel x8, x1, x8, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1b { z1.b }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    strb w0, [x9, x8]
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 %idx
+  ret <vscale x 32 x i8> %ins
+}
+
+define <vscale x 8 x float> @split_insert_8f32_idx(<vscale x 8 x float> %a, float %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_8f32_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    csel x8, x0, x8, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    str s2, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x float> %a, float %elt, i64 %idx
+  ret <vscale x 8 x float> %ins
+}
+
+define <vscale x 8 x i64> @split_insert_8i64_idx(<vscale x 8 x i64> %a, i64 %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_8i64_idx:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    sub x8, x8, #1 // =1
+; CHECK-NEXT:    cmp x1, x8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    csel x8, x1, x8, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z3.d }, p0, [x9, #3, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x9, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    str x0, [x9, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x9, #2, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x9, #3, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x i64> %a, i64 %elt, i64 %idx
+  ret <vscale x 8 x i64> %ins
+}
+
+; INSERT VECTOR ELT, CONSTANT IDX
+
+define <vscale x 4 x i16> @promote_insert_4i16(<vscale x 4 x i16> %a, i16 %elt) {
+; CHECK-LABEL: promote_insert_4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    mov z0.s, p0/m, w0
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 4 x i16> %a, i16 %elt, i64 5
+  ret <vscale x 4 x i16> %ins
+}
+
+; In this test, the index is small enough that we know it will be in the
+; low half of the vector and there is no need to go through the stack as
+; done in the remaining tests
+define <vscale x 32 x i8> @split_insert_32i8(<vscale x 32 x i8> %a, i8 %elt) {
+; CHECK-LABEL: split_insert_32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #3
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    mov z3.b, w8
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT:    mov z0.b, p0/m, w0
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 3
+  ret <vscale x 32 x i8> %ins
+}
+
+define <vscale x 32 x i16> @split_insert_32i16(<vscale x 32 x i16> %a, i16 %elt) {
+; CHECK-LABEL: split_insert_32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    sub x10, x10, #1 // =1
+; CHECK-NEXT:    mov w9, #128
+; CHECK-NEXT:    cmp x10, #128 // =128
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    csel x9, x10, x9, lo
+; CHECK-NEXT:    st1h { z3.h }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    st1h { z2.h }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    strh w0, [x8, x9, lsl #1]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 32 x i16> %a, i16 %elt, i64 128
+  ret <vscale x 32 x i16> %ins
+}
+
+define <vscale x 8 x i32> @split_insert_8i32(<vscale x 8 x i32> %a, i32 %elt) {
+; CHECK-LABEL: split_insert_8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov w9, #16960
+; CHECK-NEXT:    cnth x10
+; CHECK-NEXT:    movk w9, #15, lsl #16
+; CHECK-NEXT:    sub x10, x10, #1 // =1
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    csel x9, x10, x9, lo
+; CHECK-NEXT:    st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK-NEXT:    str w0, [x8, x9, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %ins = insertelement <vscale x 8 x i32> %a, i32 %elt, i64 1000000
+  ret <vscale x 8 x i32> %ins
+}


        


More information about the llvm-commits mailing list