[llvm] r355326 - Re-commit r355104: "[AArch64][GlobalISel] Add support for 64 bit vector shuffle using TBL1."
Hans Wennborg via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 25 03:09:48 PDT 2019
Oh I think I see what's going on. Patch: https://reviews.llvm.org/D61124
On Wed, Apr 24, 2019 at 3:26 PM Hans Wennborg <hans at chromium.org> wrote:
>
> Hi Amara,
>
> We're seeing link failures involving R_AARCH64_LDST128_ABS_LO12_NC
> relocations, which require 16-byte alignment, referring to 8-byte-aligned
> constant pool entries.
>
> The load in question is emitted from
> AArch64InstructionSelector::emitLoadFromConstantPool(), which you
> touched here.
>
> emitLoadFromConstantPool() is getting called with a CPVal that looks like this:
> <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1,
> i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
> It gets emitted in the constant pool with 8-byte alignment, but the
> code emits a load that expects 16-byte alignment.
>
> We didn't notice this until now, because lld was previously
> over-aligning the section.
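>
> For reference, R_AARCH64_LDST128_ABS_LO12_NC encodes bits [11:4] of the
> target address, so it can only be resolved if the constant is 16-byte
> aligned. A minimal sketch of that constraint (hypothetical helper, not
> lld's actual code):
>
>   #include <cstdint>
>   #include <stdexcept>
>
>   // Bits [3:0] are not encoded by the relocation, so they must be zero,
>   // i.e. the target of the 128-bit load must be 16-byte aligned.
>   void checkLdst128Alignment(uint64_t TargetVA) {
>     if (TargetVA & 0xF)
>       throw std::runtime_error("LDST128 target not 16-byte aligned");
>   }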
>
> Details here:
> https://bugs.chromium.org/p/chromium/issues/detail?id=953815
> There's a reproducer in Comment 12.
>
> Could you take a look?
>
> Thanks,
> Hans
>
> On Mon, Mar 4, 2019 at 8:15 PM Amara Emerson via llvm-commits
> <llvm-commits at lists.llvm.org> wrote:
> >
> > Author: aemerson
> > Date: Mon Mar 4 11:16:00 2019
> > New Revision: 355326
> >
> > URL: http://llvm.org/viewvc/llvm-project?rev=355326&view=rev
> > Log:
> > Re-commit r355104: "[AArch64][GlobalISel] Add support for 64 bit vector shuffle using TBL1."
> >
> > The code to materialize a mask from a constant pool load tried to use a 128-bit
> > LDR to load a 64-bit constant pool entry, which was only 8-byte aligned. This
> > resulted in a link failure in the NEON tests in the test suite, since the LDR
> > address was unaligned. This change instead emits a 64-bit LDR if the entry is
> > 64 bits, before converting back to a 128-bit register for the TBL.
> >
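> > In outline, the fix picks the load width from the constant pool entry's store
> > size, so a 64-bit mask uses a D-register load that only needs 8-byte
> > alignment. A simplified sketch of that choice (assumes LLVM's AArch64 backend
> > headers; hypothetical helper, not the function the patch actually adds):
> >
> >   // Map the constant pool entry's store size to a suitably aligned load.
> >   static unsigned pickConstantPoolLoadOpcode(unsigned StoreSizeInBytes) {
> >     switch (StoreSizeInBytes) {
> >     case 16:
> >       return AArch64::LDRQui; // 128-bit LDR; entry must be 16-byte aligned.
> >     case 8:
> >       return AArch64::LDRDui; // 64-bit LDR; 8-byte alignment suffices.
> >     default:
> >       return 0; // Unsupported size; the selector bails out.
> >     }
> >   }
> >
> > The 64-bit load result is then widened into an FPR128 register before the TBL.
> >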
> > Modified:
> > llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
> > llvm/trunk/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir
> >
> > Modified: llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp?rev=355326&r1=355325&r2=355326&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp (original)
> > +++ llvm/trunk/lib/Target/AArch64/AArch64InstructionSelector.cpp Mon Mar 4 11:16:00 2019
> > @@ -67,7 +67,7 @@ private:
> >
> > // Helper to generate an equivalent of scalar_to_vector into a new register,
> > // returned via 'Dst'.
> > - MachineInstr *emitScalarToVector(const LLT DstTy,
> > + MachineInstr *emitScalarToVector(unsigned EltSize,
> > const TargetRegisterClass *DstRC,
> > unsigned Scalar,
> > MachineIRBuilder &MIRBuilder) const;
> > @@ -82,6 +82,8 @@ private:
> > unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
> > MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
> > MachineIRBuilder &MIRBuilder) const;
> > + MachineInstr *emitVectorConcat(unsigned Op1, unsigned Op2,
> > + MachineIRBuilder &MIRBuilder) const;
> >
> > ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
> >
> > @@ -1713,7 +1715,7 @@ bool AArch64InstructionSelector::select(
> > }
> >
> > MachineInstr *AArch64InstructionSelector::emitScalarToVector(
> > - const LLT DstTy, const TargetRegisterClass *DstRC, unsigned Scalar,
> > + unsigned EltSize, const TargetRegisterClass *DstRC, unsigned Scalar,
> > MachineIRBuilder &MIRBuilder) const {
> > auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
> >
> > @@ -1727,7 +1729,7 @@ MachineInstr *AArch64InstructionSelector
> > return &*Ins;
> > };
> >
> > - switch (DstTy.getElementType().getSizeInBits()) {
> > + switch (EltSize) {
> > case 16:
> > return BuildFn(AArch64::hsub);
> > case 32:
> > @@ -1957,13 +1959,123 @@ MachineInstr *AArch64InstructionSelector
> > auto Adrp =
> > MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
> > .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
> > - auto Load =
> > - MIRBuilder.buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
> > - .addConstantPoolIndex(CPIdx, 0,
> > - AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
> > +
> > + MachineInstr *LoadMI = nullptr;
> > + switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
> > + case 16:
> > + LoadMI =
> > + &*MIRBuilder
> > + .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
> > + .addConstantPoolIndex(CPIdx, 0,
> > + AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
> > + break;
> > + case 8:
> > + LoadMI = &*MIRBuilder
> > + .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
> > + .addConstantPoolIndex(
> > + CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
> > + break;
> > + default:
> > + LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
> > + << *CPVal->getType());
> > + return nullptr;
> > + }
> > constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
> > - constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
> > - return &*Load;
> > + constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
> > + return LoadMI;
> > +}
> > +
> > +/// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
> > +/// size and RB.
> > +static std::pair<unsigned, unsigned>
> > +getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
> > + unsigned Opc, SubregIdx;
> > + if (RB.getID() == AArch64::GPRRegBankID) {
> > + if (EltSize == 32) {
> > + Opc = AArch64::INSvi32gpr;
> > + SubregIdx = AArch64::ssub;
> > + } else if (EltSize == 64) {
> > + Opc = AArch64::INSvi64gpr;
> > + SubregIdx = AArch64::dsub;
> > + } else {
> > + llvm_unreachable("invalid elt size!");
> > + }
> > + } else {
> > + if (EltSize == 8) {
> > + Opc = AArch64::INSvi8lane;
> > + SubregIdx = AArch64::bsub;
> > + } else if (EltSize == 16) {
> > + Opc = AArch64::INSvi16lane;
> > + SubregIdx = AArch64::hsub;
> > + } else if (EltSize == 32) {
> > + Opc = AArch64::INSvi32lane;
> > + SubregIdx = AArch64::ssub;
> > + } else if (EltSize == 64) {
> > + Opc = AArch64::INSvi64lane;
> > + SubregIdx = AArch64::dsub;
> > + } else {
> > + llvm_unreachable("invalid elt size!");
> > + }
> > + }
> > + return std::make_pair(Opc, SubregIdx);
> > +}
> > +
> > +MachineInstr *AArch64InstructionSelector::emitVectorConcat(
> > + unsigned Op1, unsigned Op2, MachineIRBuilder &MIRBuilder) const {
> > + // We implement a vector concat by:
> > + // 1. Use scalar_to_vector to insert the lower vector into the larger dest
> > + // 2. Insert the upper vector into the destination's upper element
> > + // TODO: some of this code is common with G_BUILD_VECTOR handling.
> > + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
> > +
> > + const LLT Op1Ty = MRI.getType(Op1);
> > + const LLT Op2Ty = MRI.getType(Op2);
> > +
> > + if (Op1Ty != Op2Ty) {
> > + LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
> > + return nullptr;
> > + }
> > + assert(Op1Ty.isVector() && "Expected a vector for vector concat");
> > +
> > + if (Op1Ty.getSizeInBits() >= 128) {
> > + LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
> > + return nullptr;
> > + }
> > +
> > + // At the moment we just support 64 bit vector concats.
> > + if (Op1Ty.getSizeInBits() != 64) {
> > + LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
> > + return nullptr;
> > + }
> > +
> > + const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
> > + const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
> > + const TargetRegisterClass *DstRC =
> > + getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
> > +
> > + MachineInstr *WidenedOp1 =
> > + emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
> > + MachineInstr *WidenedOp2 =
> > + emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
> > + if (!WidenedOp1 || !WidenedOp2) {
> > + LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
> > + return nullptr;
> > + }
> > +
> > + // Now do the insert of the upper element.
> > + unsigned InsertOpc, InsSubRegIdx;
> > + std::tie(InsertOpc, InsSubRegIdx) =
> > + getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
> > +
> > + auto InsElt =
> > + MIRBuilder
> > + .buildInstr(InsertOpc, {DstRC}, {WidenedOp1->getOperand(0).getReg()})
> > + .addImm(1) /* Lane index */
> > + .addUse(WidenedOp2->getOperand(0).getReg())
> > + .addImm(0);
> > +
> > + constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
> > + return &*InsElt;
> > }
> >
> > bool AArch64InstructionSelector::selectShuffleVector(
> > @@ -2003,21 +2115,43 @@ bool AArch64InstructionSelector::selectS
> > }
> > }
> >
> > - if (DstTy.getSizeInBits() != 128) {
> > - assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
> > - // This case can be done with TBL1.
> > - return false;
> > - }
> > + MachineIRBuilder MIRBuilder(I);
> >
> > // Use a constant pool to load the index vector for TBL.
> > Constant *CPVal = ConstantVector::get(CstIdxs);
> > - MachineIRBuilder MIRBuilder(I);
> > MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
> > if (!IndexLoad) {
> > LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
> > return false;
> > }
> >
> > + if (DstTy.getSizeInBits() != 128) {
> > + assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
> > + // This case can be done with TBL1.
> > + MachineInstr *Concat = emitVectorConcat(Src1Reg, Src2Reg, MIRBuilder);
> > + if (!Concat) {
> > + LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
> > + return false;
> > + }
> > +
> > + // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
> > + IndexLoad =
> > + emitScalarToVector(64, &AArch64::FPR128RegClass,
> > + IndexLoad->getOperand(0).getReg(), MIRBuilder);
> > +
> > + auto TBL1 = MIRBuilder.buildInstr(
> > + AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
> > + {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
> > + constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
> > +
> > + auto Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
> > + TII.get(TargetOpcode::COPY), I.getOperand(0).getReg())
> > + .addUse(TBL1->getOperand(0).getReg(), 0, AArch64::dsub);
> > + RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
> > + I.eraseFromParent();
> > + return true;
> > + }
> > +
> > // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
> > // Q registers for regalloc.
> > auto RegSeq = MIRBuilder
> > @@ -2049,32 +2183,15 @@ bool AArch64InstructionSelector::selectB
> > const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
> > unsigned Opc;
> > unsigned SubregIdx;
> > - if (RB.getID() == AArch64::GPRRegBankID) {
> > - if (EltSize == 32) {
> > - Opc = AArch64::INSvi32gpr;
> > - SubregIdx = AArch64::ssub;
> > - } else {
> > - Opc = AArch64::INSvi64gpr;
> > - SubregIdx = AArch64::dsub;
> > - }
> > - } else {
> > - if (EltSize == 16) {
> > - Opc = AArch64::INSvi16lane;
> > - SubregIdx = AArch64::hsub;
> > - } else if (EltSize == 32) {
> > - Opc = AArch64::INSvi32lane;
> > - SubregIdx = AArch64::ssub;
> > - } else {
> > - Opc = AArch64::INSvi64lane;
> > - SubregIdx = AArch64::dsub;
> > - }
> > - }
> > +
> > + std::tie(Opc, SubregIdx) = getInsertVecEltOpInfo(RB, EltSize);
> >
> > MachineIRBuilder MIRBuilder(I);
> >
> > const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
> > MachineInstr *ScalarToVec =
> > - emitScalarToVector(DstTy, DstRC, I.getOperand(1).getReg(), MIRBuilder);
> > + emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
> > + I.getOperand(1).getReg(), MIRBuilder);
> > if (!ScalarToVec)
> > return false;
> >
> >
> > Modified: llvm/trunk/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir
> > URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir?rev=355326&r1=355325&r2=355326&view=diff
> > ==============================================================================
> > --- llvm/trunk/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir (original)
> > +++ llvm/trunk/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir Mon Mar 4 11:16:00 2019
> > @@ -1,11 +1,17 @@
> > # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
> > +# WARNING: update_mir_test_checks.py does not include the constant pool output,
> > +# so this test requires manual fixing up after running the script.
> > +
> > # RUN: llc -mtriple=aarch64-- -O0 -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s
> > --- |
> > - ; ModuleID = 'shufflevec-only-legal.ll'
> > - source_filename = "shufflevec-only-legal.ll"
> > target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
> > target triple = "aarch64"
> >
> > + define <2 x float> @shuffle_v2f32(<2 x float> %a, <2 x float> %b) {
> > + %shuf = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 0>
> > + ret <2 x float> %shuf
> > + }
> > +
> > define <4 x i32> @shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
> > %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
> > ret <4 x i32> %shuf
> > @@ -23,19 +29,52 @@
> >
> > ...
> > ---
> > +name: shuffle_v2f32
> > +alignment: 2
> > +legalized: true
> > +regBankSelected: true
> > +tracksRegLiveness: true
> > +body: |
> > + bb.1 (%ir-block.0):
> > + liveins: $d0, $d1
> > +
> > + ; CHECK-LABEL: name: shuffle_v2f32
> > + ; CHECK: constants:
> > + ; CHECK: - id: 0
> > + ; CHECK: value: '<8 x i8> <i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3>'
> > + ; CHECK: alignment: 8
> > + ; CHECK: liveins: $d0, $d1
> > + ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
> > + ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
> > + ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
> > + ; CHECK: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
> > + ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
> > + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
> > + ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
> > + ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
> > + ; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0
> > + ; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
> > + ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[LDRDui]], %subreg.dsub
> > + ; CHECK: [[TBLv16i8One:%[0-9]+]]:fpr128 = TBLv16i8One [[INSvi64lane]], [[INSERT_SUBREG2]]
> > + ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY [[TBLv16i8One]].dsub
> > + ; CHECK: $d0 = COPY [[COPY2]]
> > + ; CHECK: RET_ReallyLR implicit $d0
> > + %0:fpr(<2 x s32>) = COPY $d0
> > + %1:fpr(<2 x s32>) = COPY $d1
> > + %4:gpr(s32) = G_CONSTANT i32 1
> > + %5:gpr(s32) = G_CONSTANT i32 0
> > + %3:fpr(<2 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32)
> > + %2:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1, %3(<2 x s32>)
> > + $d0 = COPY %2(<2 x s32>)
> > + RET_ReallyLR implicit $d0
> > +
> > +...
> > +---
> > name: shuffle_v4i32
> > alignment: 2
> > legalized: true
> > regBankSelected: true
> > tracksRegLiveness: true
> > -registers:
> > - - { id: 0, class: fpr }
> > - - { id: 1, class: fpr }
> > - - { id: 2, class: fpr }
> > - - { id: 3, class: fpr }
> > - - { id: 4, class: gpr }
> > - - { id: 5, class: gpr }
> > - - { id: 6, class: gpr }
> > body: |
> > bb.1 (%ir-block.0):
> > liveins: $q0, $q1
> > @@ -71,15 +110,6 @@ alignment: 2
> > legalized: true
> > regBankSelected: true
> > tracksRegLiveness: true
> > -registers:
> > - - { id: 0, class: fpr }
> > - - { id: 1, class: fpr }
> > - - { id: 2, class: fpr }
> > - - { id: 3, class: fpr }
> > - - { id: 4, class: gpr }
> > - - { id: 5, class: gpr }
> > - - { id: 6, class: gpr }
> > - - { id: 7, class: gpr }
> > body: |
> > bb.1 (%ir-block.0):
> > liveins: $q0, $q1
> > @@ -116,12 +146,6 @@ alignment: 2
> > legalized: true
> > regBankSelected: true
> > tracksRegLiveness: true
> > -registers:
> > - - { id: 0, class: fpr }
> > - - { id: 1, class: fpr }
> > - - { id: 2, class: fpr }
> > - - { id: 3, class: fpr }
> > - - { id: 4, class: gpr }
> > body: |
> > bb.1 (%ir-block.0):
> > liveins: $q0, $q1
> >
> >
> > _______________________________________________
> > llvm-commits mailing list
> > llvm-commits at lists.llvm.org
> > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits