[llvm] r361021 - AMDGPU/GlobalISel: Use waterfall loop for buffer_load
Mikael Holmén via llvm-commits
llvm-commits at lists.llvm.org
Mon May 20 05:59:13 PDT 2019
Hi,
Old clang versions (e.g. 3.6.0) warn on this patch:
../lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp:163:51: error: suggest braces around initialization of subobject [-Werror,-Wmissing-braces]
    const std::array<unsigned, 3> RegSrcOpIdx = { 2, 3, 4 };
                                                  ^~~~~~~
                                                  {      }
../lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp:182:51: error: suggest braces around initialization of subobject [-Werror,-Wmissing-braces]
    const std::array<unsigned, 2> RegSrcOpIdx = { 2, 3 };
                                                  ^~~~
                                                  {   }
2 errors generated.
I think it's due to this bug that was fixed in clang 6.0:
https://bugs.llvm.org/show_bug.cgi?id=21629
I'm not sure if anything could or should be done to the code on trunk to
make it compile without warnings with old clang versions as well?
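If we do want to keep old hosts warning-free, one option (just a sketch, I
haven't actually tried it with 3.6.0) would be to use double braces, which
both old and new clang accept, since std::array is an aggregate wrapping a
built-in array and the inner braces initialize that array subobject
explicitly:

  // Hypothetical tweak to the two initializers flagged above; the extra
  // braces spell out the inner C-array subobject, which is exactly what
  // -Wmissing-braces on pre-6.0 clang is asking for.
  const std::array<unsigned, 3> RegSrcOpIdx = {{ 2, 3, 4 }};  // line 163
  const std::array<unsigned, 2> RegSrcOpIdx = {{ 2, 3 }};     // line 182

That should be behavior-identical and keep -Werror builds happy on older
compilers.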
/Mikael
On 5/17/19 2:02 PM, Matt Arsenault via llvm-commits wrote:
> Author: arsenm
> Date: Fri May 17 05:02:27 2019
> New Revision: 361021
>
> URL: http://llvm.org/viewvc/llvm-project?rev=361021&view=rev
> Log:
> AMDGPU/GlobalISel: Use waterfall loop for buffer_load
>
> This adds support for more complex waterfall loops that need to handle
> operands > 32-bits, and multiple operands.
>
> Added:
> llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir
> Modified:
> llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
> llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
> llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp?rev=361021&r1=361020&r2=361021&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp Fri May 17 05:02:27 2019
> @@ -105,6 +105,69 @@ const RegisterBank &AMDGPURegisterBankIn
> return getRegBank(AMDGPU::VGPRRegBankID);
> }
>
> +template <unsigned NumOps>
> +RegisterBankInfo::InstructionMappings
> +AMDGPURegisterBankInfo::addMappingFromTable(
> + const MachineInstr &MI, const MachineRegisterInfo &MRI,
> + const std::array<unsigned, NumOps> RegSrcOpIdx,
> + ArrayRef<OpRegBankEntry<NumOps>> Table) const {
> +
> + InstructionMappings AltMappings;
> +
> + SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
> +
> + unsigned Sizes[NumOps];
> + for (unsigned I = 0; I < NumOps; ++I) {
> + unsigned Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
> + Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
> + }
> +
> + for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
> + unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
> + Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
> + }
> +
> + unsigned MappingID = 0;
> + for (const auto &Entry : Table) {
> + for (unsigned I = 0; I < NumOps; ++I) {
> + int OpIdx = RegSrcOpIdx[I];
> + Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
> + }
> +
> + AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
> + getOperandsMapping(Operands),
> + Operands.size()));
> + }
> +
> + return AltMappings;
> +}
> +
> +RegisterBankInfo::InstructionMappings
> +AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
> + const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
> +
> + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
> + case Intrinsic::amdgcn_buffer_load: {
> + static const OpRegBankEntry<3> Table[4] = {
> + // Perfectly legal.
> + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
> + { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
> +
> + // Waterfall loop needed for rsrc. In the worst case this will execute
> + // approximately an extra 10 * wavesize + 2 instructions.
> + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
> + { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
> + };
> +
> + // rsrc, voffset, offset
> + const std::array<unsigned, 3> RegSrcOpIdx = { 2, 3, 4 };
> + return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
> + }
> + default:
> + return RegisterBankInfo::getInstrAlternativeMappings(MI);
> + }
> +}
> +
> RegisterBankInfo::InstructionMappings
> AMDGPURegisterBankInfo::getInstrAlternativeMappings(
> const MachineInstr &MI) const {
> @@ -283,6 +346,8 @@ AMDGPURegisterBankInfo::getInstrAlternat
> AltMappings.push_back(&VMapping);
> return AltMappings;
> }
> + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
> + return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
> default:
> break;
> }
> @@ -330,10 +395,24 @@ static LLT getHalfSizedType(LLT Ty) {
> return LLT::scalar(Ty.getSizeInBits() / 2);
> }
>
> -/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
> +/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
> /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
> /// execute the instruction for each unique combination of values in all lanes
> -/// in the wave. The block will be split such that new blocks
> +/// in the wave. The block will be split such that the rest of the instructions are
> +/// moved to a new block.
> +///
> +/// Essentially performs this loop:
> +//
> +/// Save Execution Mask
> +/// For (Lane : Wavefront) {
> +/// Enable Lane, Disable all other lanes
> +/// SGPR = read SGPR value for current lane from VGPR
> +/// VGPRResult[Lane] = use_op SGPR
> +/// }
> +/// Restore Execution Mask
> +///
> +/// There is additional complexity in comparing the values to identify the
> +/// unique values used.
> void AMDGPURegisterBankInfo::executeInWaterfallLoop(
> MachineInstr &MI, MachineRegisterInfo &MRI,
> ArrayRef<unsigned> OpIndices) const {
> @@ -345,9 +424,6 @@ void AMDGPURegisterBankInfo::executeInWa
> MachineBasicBlock &MBB = *MI.getParent();
> const DebugLoc &DL = MI.getDebugLoc();
>
> - assert(OpIndices.size() == 1 &&
> - "need to implement support for multiple operands");
> -
> // Use a set to avoid extra readfirstlanes in the case where multiple operands
> // are the same register.
> SmallSet<unsigned, 4> SGPROperandRegs;
> @@ -386,13 +462,8 @@ void AMDGPURegisterBankInfo::executeInWa
> B.buildInstr(TargetOpcode::IMPLICIT_DEF)
> .addDef(InitSaveExecReg);
>
> - // Save the EXEC mask
> - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
> - .addReg(AMDGPU::EXEC);
> -
> unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> - unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
>
> // To insert the loop we need to split the block. Move everything before this
> // point to a new block, and insert a new empty block before this instruction.
> @@ -437,37 +508,172 @@ void AMDGPURegisterBankInfo::executeInWa
> LoopBB->splice(LoopBB->end(), &MBB, I);
> I = std::prev(LoopBB->end());
>
> + B.setInstr(*I);
> +
> + unsigned CondReg = AMDGPU::NoRegister;
> +
> for (MachineOperand &Op : MI.uses()) {
> if (!Op.isReg())
> continue;
>
> assert(!Op.isDef());
> if (SGPROperandRegs.count(Op.getReg())) {
> - unsigned CurrentLaneOpReg
> - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
> - MRI.setType(CurrentLaneOpReg, LLT::scalar(32)); // FIXME
> -
> - assert(MRI.getType(Op.getReg())== LLT::scalar(32) &&
> - "need to implement support for other types");
> -
> - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
> -
> - // Read the next variant <- also loop target.
> - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
> - CurrentLaneOpReg)
> - .addReg(Op.getReg());
> -
> - // FIXME: Need to and each conditon
> -
> - // Compare the just read SGPR value to all possible operand values.
> - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
> - .addDef(CondReg)
> - .addReg(CurrentLaneOpReg)
> - .addReg(Op.getReg());
> - Op.setReg(CurrentLaneOpReg);
> + LLT OpTy = MRI.getType(Op.getReg());
> + unsigned OpSize = OpTy.getSizeInBits();
> +
> + // Can only do a readlane of 32-bit pieces.
> + if (OpSize == 32) {
> + // Avoid extra copies in the simple case of one 32-bit register.
> + unsigned CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
> + MRI.setType(CurrentLaneOpReg, OpTy);
> +
> + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
> + // Read the next variant <- also loop target.
> + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
> + .addReg(Op.getReg());
> +
> + unsigned NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> + bool First = CondReg == AMDGPU::NoRegister;
> + if (First)
> + CondReg = NewCondReg;
> +
> + // Compare the just read SGPR value to all possible operand values.
> + B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
> + .addDef(NewCondReg)
> + .addReg(CurrentLaneOpReg)
> + .addReg(Op.getReg());
> + Op.setReg(CurrentLaneOpReg);
> +
> + if (!First) {
> + unsigned AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
> +
> + // If there are multiple operands to consider, AND the conditions together.
> + B.buildInstr(AMDGPU::S_AND_B64)
> + .addDef(AndReg)
> + .addReg(NewCondReg)
> + .addReg(CondReg);
> + CondReg = AndReg;
> + }
> + } else {
> + LLT S32 = LLT::scalar(32);
> + SmallVector<unsigned, 8> ReadlanePieces;
> +
> + // The compares can be done as 64-bit, but the extract needs to be done
> + // in 32-bit pieces.
> +
> + bool Is64 = OpSize % 64 == 0;
> +
> + LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
> + unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
> + : AMDGPU::V_CMP_EQ_U32_e64;
> +
> + // Insert the unmerge before the loop.
> +
> + B.setMBB(MBB);
> + auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
> + B.setInstr(*I);
> +
> + unsigned NumPieces = Unmerge->getNumOperands() - 1;
> + for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
> + unsigned UnmergePiece = Unmerge.getReg(PieceIdx);
> +
> + unsigned CurrentLaneOpReg;
> + if (Is64) {
> + unsigned CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
> + unsigned CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
> +
> + MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
> + MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
> + MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
> +
> + // FIXME: Should be able to just use a subreg index here.
> + auto Unmerge32 = B.buildUnmerge(S32, UnmergePiece);
> +
> + MRI.setRegClass(Unmerge32.getReg(0), &AMDGPU::VGPR_32RegClass);
> + MRI.setRegClass(Unmerge32.getReg(1), &AMDGPU::VGPR_32RegClass);
> +
> + // Read the next variant <- also loop target.
> + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
> + CurrentLaneOpRegLo)
> + .addReg(Unmerge32.getReg(0));
> +
> + // Read the next variant <- also loop target.
> + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
> + CurrentLaneOpRegHi)
> + .addReg(Unmerge32.getReg(1));
> +
> + CurrentLaneOpReg =
> + B.buildMerge(LLT::scalar(64),
> + {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
> + .getReg(0);
> +
> + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
> +
> + if (OpTy.getScalarSizeInBits() == 64) {
> + // If we need to produce a 64-bit element vector, use the
> + // merged pieces.
> + ReadlanePieces.push_back(CurrentLaneOpReg);
> + } else {
> + // 32-bit element type.
> + ReadlanePieces.push_back(CurrentLaneOpRegLo);
> + ReadlanePieces.push_back(CurrentLaneOpRegHi);
> + }
> + } else {
> + CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
> + MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
> + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
> +
> + // Read the next variant <- also loop target.
> + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
> + CurrentLaneOpReg)
> + .addReg(UnmergePiece);
> + ReadlanePieces.push_back(CurrentLaneOpReg);
> + }
> +
> + unsigned NewCondReg
> + = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
> + bool First = CondReg == AMDGPU::NoRegister;
> + if (First)
> + CondReg = NewCondReg;
> +
> + B.buildInstr(CmpOp)
> + .addDef(NewCondReg)
> + .addReg(CurrentLaneOpReg)
> + .addReg(UnmergePiece);
> +
> + if (!First) {
> + unsigned AndReg
> + = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
> +
> + // If there are multiple operands to consider, AND the conditions together.
> + B.buildInstr(AMDGPU::S_AND_B64)
> + .addDef(AndReg)
> + .addReg(NewCondReg)
> + .addReg(CondReg);
> + CondReg = AndReg;
> + }
> + }
> +
> + // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
> + // BUILD_VECTOR
> + if (OpTy.isVector()) {
> + auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
> + Op.setReg(Merge.getReg(0));
> + } else {
> + auto Merge = B.buildMerge(OpTy, ReadlanePieces);
> + Op.setReg(Merge.getReg(0));
> + }
> +
> + MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
> + }
> }
> }
>
> + B.setInsertPt(*LoopBB, LoopBB->end());
> +
> // Update EXEC, save the original EXEC value to VCC.
> B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
> .addDef(NewExec)
> @@ -488,7 +694,12 @@ void AMDGPURegisterBankInfo::executeInWa
> B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
> .addMBB(LoopBB);
>
> - // Restore the EXEC mask
> + // Save the EXEC mask before the loop.
> + BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
> + .addReg(AMDGPU::EXEC);
> +
> + // Restore the EXEC mask after the loop.
> + B.setMBB(*RestoreExecBB);
> B.buildInstr(AMDGPU::S_MOV_B64_term)
> .addDef(AMDGPU::EXEC)
> .addReg(SaveExecReg);
> @@ -606,6 +817,18 @@ void AMDGPURegisterBankInfo::applyMappin
> applyDefaultMapping(OpdMapper);
> executeInWaterfallLoop(MI, MRI, { 2 });
> return;
> +
> + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
> + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
> + case Intrinsic::amdgcn_buffer_load: {
> + executeInWaterfallLoop(MI, MRI, { 2 });
> + return;
> + }
> + default:
> + break;
> + }
> + break;
> + }
> default:
> break;
> }
> @@ -1012,7 +1235,7 @@ AMDGPURegisterBankInfo::getInstrMapping(
> break;
> }
> case AMDGPU::G_INTRINSIC: {
> - switch (MI.getOperand(1).getIntrinsicID()) {
> + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
> default:
> return getInvalidInstructionMapping();
> case Intrinsic::maxnum:
> @@ -1034,7 +1257,7 @@ AMDGPURegisterBankInfo::getInstrMapping(
> break;
> }
> case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
> - switch (MI.getOperand(0).getIntrinsicID()) {
> + switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
> default:
> return getInvalidInstructionMapping();
> case Intrinsic::amdgcn_exp_compr:
> @@ -1063,7 +1286,33 @@ AMDGPURegisterBankInfo::getInstrMapping(
> OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
> OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
> break;
> + case Intrinsic::amdgcn_buffer_load: {
> + unsigned RSrc = MI.getOperand(2).getReg(); // SGPR
> + unsigned VIndex = MI.getOperand(3).getReg(); // VGPR
> + unsigned Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
> +
> + unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
> + unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
> + unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
> + unsigned Size4 = MRI.getType(Offset).getSizeInBits();
> +
> + unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
> + unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
> +
> + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
> + OpdsMapping[1] = nullptr; // intrinsic id
> +
> + // Lie and claim everything is legal, even though some need to be
> + // SGPRs. applyMapping will have to deal with it as a waterfall loop.
> + OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
> + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
> + OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
> + OpdsMapping[5] = nullptr;
> + OpdsMapping[6] = nullptr;
> + break;
> }
> + }
> +
> break;
> }
> case AMDGPU::G_SELECT: {
> @@ -1121,7 +1370,8 @@ AMDGPURegisterBankInfo::getInstrMapping(
> }
> }
>
> - return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
> + return getInstructionMapping(/*ID*/1, /*Cost*/1,
> + getOperandsMapping(OpdsMapping),
> MI.getNumOperands());
> }
>
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h?rev=361021&r1=361020&r2=361021&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h Fri May 17 05:02:27 2019
> @@ -58,6 +58,22 @@ class AMDGPURegisterBankInfo : public AM
> LLT HalfTy,
> unsigned Reg) const;
>
> + template <unsigned NumOps>
> + struct OpRegBankEntry {
> + int8_t RegBanks[NumOps];
> + int16_t Cost;
> + };
> +
> + template <unsigned NumOps>
> + InstructionMappings
> + addMappingFromTable(const MachineInstr &MI, const MachineRegisterInfo &MRI,
> + const std::array<unsigned, NumOps> RegSrcOpIdx,
> + ArrayRef<OpRegBankEntry<NumOps>> Table) const;
> +
> + RegisterBankInfo::InstructionMappings
> + getInstrAlternativeMappingsIntrinsicWSideEffects(
> + const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
> +
> bool isSALUMapping(const MachineInstr &MI) const;
> const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
> const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
>
> Added: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir?rev=361021&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir (added)
> +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir Fri May 17 05:02:27 2019
> @@ -0,0 +1,289 @@
> +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
> +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s
> +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s
> +
> +---
> +name: buffer_load_sss
> +legalized: true
> +tracksRegLiveness: true
> +body: |
> + bb.0:
> + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
> +
> + ; CHECK-LABEL: name: buffer_load_sss
> + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
> + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
> + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
> + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
> + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
> + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0
> + %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
> + %1:_(s32) = COPY $sgpr4
> + %2:_(s32) = COPY $sgpr5
> + %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), %0, %1, %2, 0, 0
> +
> +...
> +
> +---
> +name: buffer_load_ssv
> +legalized: true
> +tracksRegLiveness: true
> +body: |
> + bb.0:
> + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr5
> +
> + ; CHECK-LABEL: name: buffer_load_ssv
> + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr5
> + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
> + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
> + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
> + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
> + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0
> + %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
> + %1:_(s32) = COPY $sgpr4
> + %2:_(s32) = COPY $vgpr5
> + %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), %0, %1, %2, 0, 0
> +
> +...
> +
> +---
> +name: buffer_load_svs
> +legalized: true
> +tracksRegLiveness: true
> +body: |
> + bb.0:
> + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
> +
> + ; CHECK-LABEL: name: buffer_load_svs
> + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
> + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
> + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
> + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
> + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
> + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0
> + %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
> + %1:_(s32) = COPY $sgpr4
> + %2:_(s32) = COPY $sgpr5
> + %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), %0, %1, %2, 0, 0
> +
> +...
> +
> +---
> +name: buffer_load_vss
> +legalized: true
> +tracksRegLiveness: true
> +body: |
> + bb.0:
> + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $sgpr5
> +
> + ; CHECK-LABEL: name: buffer_load_vss
> + ; CHECK: successors: %bb.1(0x80000000)
> + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $sgpr5
> + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
> + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
> + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
> + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
> + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
> + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
> + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
> + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
> + ; CHECK: .1:
> + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
> + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %10, %bb.1
> + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
> + ; CHECK: [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[UV]](s64)
> + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
> + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
> + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
> + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
> + ; CHECK: [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[UV1]](s64)
> + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
> + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
> + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
> + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
> + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
> + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
> + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0
> + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
> + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
> + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
> + ; CHECK: .2:
> + ; CHECK: successors: %bb.3(0x80000000)
> + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> + ; CHECK: .3:
> + %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
> + %1:_(s32) = COPY $sgpr4
> + %2:_(s32) = COPY $sgpr5
> + %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), %0, %1, %2, 0, 0
> +
> +...
> +
> +---
> +name: buffer_load_vvs
> +legalized: true
> +tracksRegLiveness: true
> +body: |
> + bb.0:
> + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr5
> +
> + ; CHECK-LABEL: name: buffer_load_vvs
> + ; CHECK: successors: %bb.1(0x80000000)
> + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr5
> + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
> + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
> + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
> + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
> + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
> + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
> + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
> + ; CHECK: .1:
> + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
> + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1
> + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
> + ; CHECK: [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[UV]](s64)
> + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
> + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
> + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
> + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
> + ; CHECK: [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[UV1]](s64)
> + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
> + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
> + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
> + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
> + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
> + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
> + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0
> + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
> + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
> + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
> + ; CHECK: .2:
> + ; CHECK: successors: %bb.3(0x80000000)
> + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> + ; CHECK: .3:
> + %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
> + %1:_(s32) = COPY $vgpr4
> + %2:_(s32) = COPY $sgpr5
> + %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), %0, %1, %2, 0, 0
> +
> +...
> +
> +---
> +name: buffer_load_svv
> +legalized: true
> +tracksRegLiveness: true
> +body: |
> + bb.0:
> + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr4, $vgpr5
> +
> + ; CHECK-LABEL: name: buffer_load_svv
> + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr4, $vgpr5
> + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
> + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
> + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
> + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0
> + %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
> + %1:_(s32) = COPY $vgpr4
> + %2:_(s32) = COPY $vgpr5
> + %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), %0, %1, %2, 0, 0
> +
> +...
> +
> +---
> +name: buffer_load_vsv
> +legalized: true
> +tracksRegLiveness: true
> +body: |
> + bb.0:
> + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr5
> +
> + ; CHECK-LABEL: name: buffer_load_vsv
> + ; CHECK: successors: %bb.1(0x80000000)
> + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr5
> + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
> + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
> + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
> + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
> + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
> + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
> + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
> + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
> + ; CHECK: .1:
> + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
> + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %10, %bb.1
> + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
> + ; CHECK: [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[UV]](s64)
> + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
> + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
> + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
> + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
> + ; CHECK: [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[UV1]](s64)
> + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
> + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
> + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
> + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
> + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
> + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
> + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0
> + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
> + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
> + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
> + ; CHECK: .2:
> + ; CHECK: successors: %bb.3(0x80000000)
> + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> + ; CHECK: .3:
> + %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
> + %1:_(s32) = COPY $sgpr4
> + %2:_(s32) = COPY $vgpr5
> + %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), %0, %1, %2, 0, 0
> +
> +...
> +
> +---
> +name: buffer_load_vvv
> +legalized: true
> +tracksRegLiveness: true
> +body: |
> + bb.0:
> + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5
> +
> + ; CHECK-LABEL: name: buffer_load_vvv
> + ; CHECK: successors: %bb.1(0x80000000)
> + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5
> + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
> + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
> + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
> + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
> + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
> + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
> + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
> + ; CHECK: .1:
> + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
> + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1
> + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
> + ; CHECK: [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[UV]](s64)
> + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
> + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
> + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32)
> + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec
> + ; CHECK: [[UV4:%[0-9]+]]:vgpr_32(s32), [[UV5:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[UV1]](s64)
> + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV4]](s32), implicit $exec
> + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV5]](s32), implicit $exec
> + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
> + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
> + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
> + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
> + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0
> + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
> + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
> + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
> + ; CHECK: .2:
> + ; CHECK: successors: %bb.3(0x80000000)
> + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> + ; CHECK: .3:
> + %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
> + %1:_(s32) = COPY $vgpr4
> + %2:_(s32) = COPY $vgpr5
> + %3:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), %0, %1, %2, 0, 0
> +
> +...
> +
>
> Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir?rev=361021&r1=361020&r2=361021&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir (original)
> +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir Fri May 17 05:02:27 2019
> @@ -44,14 +44,14 @@ body: |
> ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1
> ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1
> ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
> - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY2]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
> ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
> + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY2]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
> ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
> ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
> ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
> - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> ; CHECK: .2:
> ; CHECK: successors: %bb.3(0x80000000)
> + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> ; CHECK: .3:
> ; CHECK: $vgpr0 = COPY [[EVEC]](s32)
> %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
> @@ -101,14 +101,14 @@ body: |
> ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1
> ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1
> ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
> - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
> ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
> + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
> ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
> ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
> ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
> - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> ; CHECK: .2:
> ; CHECK: successors: %bb.3(0x80000000)
> + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> ; CHECK: .3:
> ; CHECK: $vgpr0 = COPY [[EVEC]](s32)
> %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
> @@ -138,14 +138,14 @@ body: |
> ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1
> ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s64) = G_PHI [[DEF]](s64), %bb.0, %2(s64), %bb.1
> ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
> - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[V_READFIRSTLANE_B32_]](s32)
> ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
> + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[V_READFIRSTLANE_B32_]](s32)
> ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
> ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
> ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
> - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> ; CHECK: .2:
> ; CHECK: successors: %bb.3(0x80000000)
> + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
> ; CHECK: .3:
> ; CHECK: $vgpr0_vgpr1 = COPY [[EVEC]](s64)
> %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
>
>