[llvm] 455ed56 - [SVE][CodeGen] Legalisation of INSERT_VECTOR_ELT for scalable vectors
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 11 04:58:10 PDT 2020
Author: Kerry McLaughlin
Date: 2020-08-11T12:57:28+01:00
New Revision: 455ed56d48e365f7d095254109abed876dc10c65
URL: https://github.com/llvm/llvm-project/commit/455ed56d48e365f7d095254109abed876dc10c65
DIFF: https://github.com/llvm/llvm-project/commit/455ed56d48e365f7d095254109abed876dc10c65.diff
LOG: [SVE][CodeGen] Legalisation of INSERT_VECTOR_ELT for scalable vectors
When the result type of insertelement needs to be split,
SplitVecRes_INSERT_VECTOR_ELT falls back to storing the vector to a
stack temporary, storing the element at the address of the temporary
plus the byte offset of the index, and reloading the Lo/Hi parts from
the slot.
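For reference, that expansion corresponds roughly to the SelectionDAG
sequence below. This is a simplified, illustrative sketch rather than the
exact code; Vec, Elt, Idx, PtrInfo, SmallestAlign, EltAlign and
HiPtr/HiPtrInfo stand in for values computed in
SplitVecRes_INSERT_VECTOR_ELT.

  // Spill the whole (possibly scalable) vector to a stack slot.
  SDValue StackPtr = DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
                               SmallestAlign);
  // Address of element Idx within the slot (the index is clamped first, see
  // the second sketch below), then overwrite that element.
  SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
  Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
                            MachinePointerInfo::getUnknownStack(MF), EltVT,
                            EltAlign);
  // Reload the two halves of the result; HiPtr is StackPtr advanced by the
  // (vscale-scaled, for scalable types) store size of LoVT.
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
  Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
  Hi = DAG.getLoad(HiVT, dl, Store, HiPtr, HiPtrInfo, SmallestAlign);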
This patch does the following to ensure this works for scalable vectors:
- Sets the StackID with getStackIDForScalableVectors() in CreateStackTemporary
- Handles scalable offsets in the TypeSize variant of getMemBasePlusOffset()
  by scaling the known-minimum offset by VScale when the offset is scalable
- Ensures the index (constant or not) is clamped correctly by
  clampDynamicVectorIndex so that we don't try to use an out-of-range index
  (see the sketch after this list)
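For a scalable vector the clamp amounts to umin(Idx, vscale * MinNumElts - 1).
A minimal sketch of how the nodes are built, mirroring the
clampDynamicVectorIndex() change below (Idx, IdxVT and VecVT are assumed to
be in scope):

  // Clamp a (possibly constant) index into <vscale x N x Ty> so the element
  // store cannot escape the stack temporary.
  unsigned NElts = VecVT.getVectorMinNumElements();  // e.g. 32 for <vscale x 32 x i8>
  SDValue VS = DAG.getVScale(dl, IdxVT,
                             APInt(IdxVT.getSizeInBits().getFixedSize(), NElts));
  SDValue LastElt = DAG.getNode(ISD::SUB, dl, IdxVT, VS,
                                DAG.getConstant(1, dl, IdxVT)); // vscale*NElts - 1
  SDValue Clamped = DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, LastElt);

On AArch64 this shows up in the tests below as, e.g.,
rdvl x8, #2 / sub x8, x8, #1 / cmp x1, x8 / csel x8, x1, x8, lo
in split_insert_32i8_idx.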
Reviewed By: david-arm
Differential Revision: https://reviews.llvm.org/D84874
Added:
llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 55a612bbb01c..4485fb044f34 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1456,14 +1456,16 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
unsigned IdxVal = CIdx->getZExtValue();
- unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
- if (IdxVal < LoNumElts)
+ unsigned LoNumElts = Lo.getValueType().getVectorMinNumElements();
+ if (IdxVal < LoNumElts) {
Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
Lo.getValueType(), Lo, Elt, Idx);
- else
+ return;
+ } else if (!Vec.getValueType().isScalableVector()) {
Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl));
- return;
+ return;
+ }
}
// See if the target wants to custom expand this node.
@@ -1476,7 +1478,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
if (VecVT.getScalarSizeInBits() < 8) {
EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
- VecVT.getVectorNumElements());
+ VecVT.getVectorElementCount());
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
// Extend the element type to match if needed.
if (EltVT.bitsGT(Elt.getValueType()))
@@ -1501,7 +1503,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
Store = DAG.getTruncStore(
Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
- commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
+ commonAlignment(SmallestAlign,
+ EltVT.getSizeInBits().getFixedSize() / 8));
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@@ -1510,13 +1513,11 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
// Increment the pointer to the other part.
- unsigned IncrementSize = LoVT.getSizeInBits() / 8;
- StackPtr =
- DAG.getMemBasePlusOffset(StackPtr, TypeSize::Fixed(IncrementSize), dl);
+ auto Load = cast<LoadSDNode>(Lo);
+ MachinePointerInfo MPI = Load->getPointerInfo();
+ IncrementPointer(Load, LoVT, MPI, StackPtr);
- // Load the Hi part from the stack slot.
- Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
+ Hi = DAG.getLoad(HiVT, dl, Store, StackPtr, MPI, SmallestAlign);
// If we adjusted the original type, we need to truncate the results.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0095f6299ac6..26c75ab2bb33 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2025,7 +2025,12 @@ Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
MachineFrameInfo &MFI = MF->getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false);
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ int StackID = 0;
+ if (Bytes.isScalable())
+ StackID = TFI->getStackIDForScalableVectors();
+ int FrameIdx = MFI.CreateStackObject(Bytes, Alignment,
+ false, nullptr, StackID);
return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
}
@@ -5937,8 +5942,16 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset,
const SDLoc &DL,
const SDNodeFlags Flags) {
EVT VT = Base.getValueType();
- return getMemBasePlusOffset(Base, getConstant(Offset.getFixedSize(), DL, VT),
- DL, Flags);
+ SDValue Index;
+
+ if (Offset.isScalable())
+ Index = getVScale(DL, Base.getValueType(),
+ APInt(Base.getValueSizeInBits().getFixedSize(),
+ Offset.getKnownMinSize()));
+ else
+ Index = getConstant(Offset.getFixedSize(), DL, VT);
+
+ return getMemBasePlusOffset(Base, Index, DL, Flags);
}
SDValue SelectionDAG::getMemBasePlusOffset(SDValue Ptr, SDValue Offset,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 911a23c966d6..b289a964606d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7197,16 +7197,26 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
SDValue Idx,
EVT VecVT,
const SDLoc &dl) {
- if (isa<ConstantSDNode>(Idx))
+ if (!VecVT.isScalableVector() && isa<ConstantSDNode>(Idx))
return Idx;
EVT IdxVT = Idx.getValueType();
- unsigned NElts = VecVT.getVectorNumElements();
- if (isPowerOf2_32(NElts)) {
- APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(),
- Log2_32(NElts));
- return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
- DAG.getConstant(Imm, dl, IdxVT));
+ unsigned NElts = VecVT.getVectorMinNumElements();
+ if (VecVT.isScalableVector()) {
+ SDValue VS = DAG.getVScale(dl, IdxVT,
+ APInt(IdxVT.getSizeInBits().getFixedSize(),
+ NElts));
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, IdxVT, VS,
+ DAG.getConstant(1, dl, IdxVT));
+
+ return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub);
+ } else {
+ if (isPowerOf2_32(NElts)) {
+ APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(),
+ Log2_32(NElts));
+ return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
+ DAG.getConstant(Imm, dl, IdxVT));
+ }
}
return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx,
@@ -7223,8 +7233,8 @@ SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
EVT EltVT = VecVT.getVectorElementType();
// Calculate the element offset and add it to the pointer.
- unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
- assert(EltSize * 8 == EltVT.getSizeInBits() &&
+ unsigned EltSize = EltVT.getSizeInBits().getFixedSize() / 8; // FIXME: should be ABI size.
+ assert(EltSize * 8 == EltVT.getSizeInBits().getFixedSize() &&
"Converting bits to bytes lost precision");
Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);
diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
new file mode 100644
index 000000000000..5e6dedf4a4cc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; INSERT VECTOR ELT
+
+define <vscale x 8 x i8> @promote_insert_8i8(<vscale x 8 x i8> %a, i8 %elt, i64 %idx) {
+; CHECK-LABEL: promote_insert_8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.h, w1
+; CHECK-NEXT: index z2.h, #0, #1
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z1.h
+; CHECK-NEXT: mov z0.h, p0/m, w0
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 8 x i8> %a, i8 %elt, i64 %idx
+ ret <vscale x 8 x i8> %ins
+}
+
+define <vscale x 32 x i8> @split_insert_32i8_idx(<vscale x 32 x i8> %a, i8 %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_32i8_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #2
+; CHECK-NEXT: sub x8, x8, #1 // =1
+; CHECK-NEXT: cmp x1, x8
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: csel x8, x1, x8, lo
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: st1b { z1.b }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: st1b { z0.b }, p0, [sp]
+; CHECK-NEXT: strb w0, [x9, x8]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 %idx
+ ret <vscale x 32 x i8> %ins
+}
+
+define <vscale x 8 x float> @split_insert_8f32_idx(<vscale x 8 x float> %a, float %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_8f32_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: sub x8, x8, #1 // =1
+; CHECK-NEXT: cmp x0, x8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: csel x8, x0, x8, lo
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: st1w { z1.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: str s2, [x9, x8, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 8 x float> %a, float %elt, i64 %idx
+ ret <vscale x 8 x float> %ins
+}
+
+define <vscale x 8 x i64> @split_insert_8i64_idx(<vscale x 8 x i64> %a, i64 %elt, i64 %idx) {
+; CHECK-LABEL: split_insert_8i64_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: sub x8, x8, #1 // =1
+; CHECK-NEXT: cmp x1, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: csel x8, x1, x8, lo
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: st1d { z3.d }, p0, [x9, #3, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [x9, #2, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str x0, [x9, x8, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x9, #2, mul vl]
+; CHECK-NEXT: ld1d { z3.d }, p0/z, [x9, #3, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 8 x i64> %a, i64 %elt, i64 %idx
+ ret <vscale x 8 x i64> %ins
+}
+
+; INSERT VECTOR ELT, CONSTANT IDX
+
+define <vscale x 4 x i16> @promote_insert_4i16(<vscale x 4 x i16> %a, i16 %elt) {
+; CHECK-LABEL: promote_insert_4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #5
+; CHECK-NEXT: index z1.s, #0, #1
+; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT: mov z0.s, p0/m, w0
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 4 x i16> %a, i16 %elt, i64 5
+ ret <vscale x 4 x i16> %ins
+}
+
+; In this test, the index is small enough that we know it will be in the
+; low half of the vector and there is no need to go through the stack as
+; done in the remaining tests
+define <vscale x 32 x i8> @split_insert_32i8(<vscale x 32 x i8> %a, i8 %elt) {
+; CHECK-LABEL: split_insert_32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #3
+; CHECK-NEXT: index z2.b, #0, #1
+; CHECK-NEXT: mov z3.b, w8
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT: mov z0.b, p0/m, w0
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 3
+ ret <vscale x 32 x i8> %ins
+}
+
+define <vscale x 32 x i16> @split_insert_32i16(<vscale x 32 x i16> %a, i16 %elt) {
+; CHECK-LABEL: split_insert_32i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: sub x10, x10, #1 // =1
+; CHECK-NEXT: mov w9, #128
+; CHECK-NEXT: cmp x10, #128 // =128
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: st1h { z3.h }, p0, [x8, #3, mul vl]
+; CHECK-NEXT: st1h { z2.h }, p0, [x8, #2, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: strh w0, [x8, x9, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT: ld1h { z3.h }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #4
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 32 x i16> %a, i16 %elt, i64 128
+ ret <vscale x 32 x i16> %ins
+}
+
+define <vscale x 8 x i32> @split_insert_8i32(<vscale x 8 x i32> %a, i32 %elt) {
+; CHECK-LABEL: split_insert_8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: mov w9, #16960
+; CHECK-NEXT: cnth x10
+; CHECK-NEXT: movk w9, #15, lsl #16
+; CHECK-NEXT: sub x10, x10, #1 // =1
+; CHECK-NEXT: cmp x10, x9
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: csel x9, x10, x9, lo
+; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: str w0, [x8, x9, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 8 x i32> %a, i32 %elt, i64 1000000
+ ret <vscale x 8 x i32> %ins
+}