[llvm] r346364 - Revert "AMDGPU: Divergence-driven selection of scalar buffer load intrinsics"
Nicolai Haehnle via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 7 13:53:44 PST 2018
Author: nha
Date: Wed Nov 7 13:53:43 2018
New Revision: 346364
URL: http://llvm.org/viewvc/llvm-project?rev=346364&view=rev
Log:
Revert "AMDGPU: Divergence-driven selection of scalar buffer load intrinsics"
This reverts commit r344696 for now (except for some test additions).
See https://bugs.freedesktop.org/show_bug.cgi?id=108611.
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/trunk/test/CodeGen/AMDGPU/smrd-fold-offset.mir
llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=346364&r1=346363&r2=346364&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Nov 7 13:53:43 2018
@@ -4847,70 +4847,6 @@ SDValue SITargetLowering::lowerImage(SDV
return SDValue(NewNode, 0);
}
-SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
- SDValue Offset, SDValue GLC,
- SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- VT.getStoreSize(), VT.getStoreSize());
-
- if (!Offset->isDivergent()) {
- SDValue Ops[] = {
- Rsrc,
- Offset, // Offset
- GLC // glc
- };
- return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
- DAG.getVTList(VT), Ops, VT, MMO);
- }
-
- // We have a divergent offset. Emit a MUBUF buffer load instead. We can
- // assume that the buffer is unswizzled.
- SmallVector<SDValue, 4> Loads;
- unsigned NumLoads = 1;
- MVT LoadVT = VT.getSimpleVT();
-
- assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
- LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
-
- if (VT == MVT::v8i32 || VT == MVT::v16i32) {
- NumLoads = VT == MVT::v16i32 ? 4 : 2;
- LoadVT = MVT::v4i32;
- }
-
- SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
- unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
- SDValue Ops[] = {
- DAG.getEntryNode(), // Chain
- Rsrc, // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- {}, // voffset
- {}, // soffset
- {}, // offset
- DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
- };
-
- // Use the alignment to ensure that the required offsets will fit into the
- // immediate offsets.
- setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
-
- uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
- for (unsigned i = 0; i < NumLoads; ++i) {
- Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
- Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
- Ops, LoadVT, MMO));
- }
-
- if (VT == MVT::v8i32 || VT == MVT::v16i32)
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
-
- return Loads[0];
-}
-
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -5065,15 +5001,38 @@ SDValue SITargetLowering::LowerINTRINSIC
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
case AMDGPUIntrinsic::SI_load_const: {
- SDValue Load =
- lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getTargetConstant(0, DL, MVT::i1), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(1), // Ptr
+ Op.getOperand(2), // Offset
+ DAG.getTargetConstant(0, DL, MVT::i1) // glc
+ };
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), 4);
+ SDVTList VTList = DAG.getVTList(MVT::i32);
+ SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+ VTList, Ops, MVT::i32, MMO);
+
return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
}
case Intrinsic::amdgcn_s_buffer_load: {
unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(1), // Ptr
+ Op.getOperand(2), // Offset
+ DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
+ };
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), VT.getStoreSize());
+ return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+ Op->getVTList(), Ops, VT, MMO);
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
@@ -6108,13 +6067,13 @@ std::pair<SDValue, SDValue> SITargetLowe
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
- SelectionDAG &DAG, SDValue *Offsets,
- unsigned Align) const {
+ SelectionDAG &DAG,
+ SDValue *Offsets) const {
SDLoc DL(CombinedOffset);
if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
uint32_t SOffset, ImmOffset;
- if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
+ if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
@@ -6126,8 +6085,8 @@ void SITargetLowering::setBufferOffsets(
SDValue N1 = CombinedOffset.getOperand(1);
uint32_t SOffset, ImmOffset;
int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
- if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
- Subtarget, Align)) {
+ if (Offset >= 0
+ && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
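
For reference, the lowerSBuffer path deleted above handled a divergent offset by emitting unswizzled MUBUF loads, breaking v8i32/v16i32 results into dwordx4 pieces 16 bytes apart and asking setBufferOffsets for a 16*NumLoads alignment so each per-piece immediate stays encodable. A minimal standalone sketch of that piece arithmetic (illustrative names only, not LLVM API; the smrd.ll checks further down show the resulting offen / offset:16 / offset:32 / offset:48 loads):

#include <cstdio>
#include <vector>

// Sketch: how the removed lowerSBuffer sliced a wide divergent-offset
// s_buffer_load into dwordx4 MUBUF loads. InstOffset is the immediate part
// produced by setBufferOffsets; aligning it to 16*NumLoads is what guarantees
// that InstOffset + 16*i is still a legal MUBUF immediate for every piece.
static std::vector<unsigned> pieceOffsets(unsigned NumDwords,
                                          unsigned InstOffset) {
  unsigned NumLoads = NumDwords <= 4 ? 1 : NumDwords / 4; // dwordx4 pieces
  std::vector<unsigned> Offs;
  for (unsigned i = 0; i != NumLoads; ++i)
    Offs.push_back(InstOffset + 16 * i);
  return Offs;
}

int main() {
  // A v16i32 result whose immediate part is 0 gives the
  // offen / offset:16 / offset:32 / offset:48 sequence checked in smrd.ll.
  for (unsigned Off : pieceOffsets(16, 0))
    std::printf("offset:%u\n", Off);
  return 0;
}
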
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=346364&r1=346363&r2=346364&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Wed Nov 7 13:53:43 2018
@@ -60,8 +60,6 @@ private:
MVT VT, unsigned Offset) const;
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const;
- SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
- SDValue GLC, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -192,7 +190,7 @@ private:
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
- SDValue *Offsets, unsigned Align = 4) const;
+ SDValue *Offsets) const;
public:
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=346364&r1=346363&r2=346364&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Wed Nov 7 13:53:43 2018
@@ -3558,13 +3558,8 @@ void SIInstrInfo::legalizeOperandsSMRD(M
// pointer value is uniform.
MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
- SBase->setReg(SGPR);
- }
- MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
- if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
- SOff->setReg(SGPR);
+ unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+ SBase->setReg(SGPR);
}
}
@@ -4193,6 +4188,115 @@ void SIInstrInfo::moveToVALU(MachineInst
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
Inst.eraseFromParent();
continue;
+
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+ unsigned VDst;
+ unsigned NewOpcode;
+
+ switch(Opcode) {
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+ splitScalarBuffer(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+ }
+
+ const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
+ auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
+ unsigned Offset = 0;
+
+ // FIXME: This isn't safe because the addressing mode doesn't work
+ // correctly if vaddr is negative.
+ //
+ // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
+ //
+ // See if we can extract an immediate offset by recognizing one of these:
+ // V_ADD_I32_e32 dst, imm, src1
+ // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
+ // V_ADD will be removed by "Remove dead machine instructions".
+ if (Add &&
+ (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
+ Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
+ Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
+ static const unsigned SrcNames[2] = {
+ AMDGPU::OpName::src0,
+ AMDGPU::OpName::src1,
+ };
+
+ // Find a literal offset in one of source operands.
+ for (int i = 0; i < 2; i++) {
+ const MachineOperand *Src =
+ getNamedOperand(*Add, SrcNames[i]);
+
+ if (Src->isReg()) {
+ MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
+ if (Def) {
+ if (Def->isMoveImmediate())
+ Src = &Def->getOperand(1);
+ else if (Def->isCopy()) {
+ auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
+ if (Mov && Mov->isMoveImmediate()) {
+ Src = &Mov->getOperand(1);
+ }
+ }
+ }
+ }
+
+ if (Src) {
+ if (Src->isImm())
+ Offset = Src->getImm();
+ else if (Src->isCImm())
+ Offset = Src->getCImm()->getZExtValue();
+ }
+
+ if (Offset && isLegalMUBUFImmOffset(Offset)) {
+ VAddr = getNamedOperand(*Add, SrcNames[!i]);
+ break;
+ }
+
+ Offset = 0;
+ }
+ }
+
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(),
+ get(NewOpcode), VDst)
+ .add(*VAddr) // vaddr
+ .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
+ .addImm(0) // soffset
+ .addImm(Offset) // offset
+ .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .cloneMemRefs(Inst)
+ .getInstr();
+
+ MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
+ VDst);
+ addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+ Inst.eraseFromParent();
+
+ // Legalize all operands other than the offset. Notably, convert the srsrc
+ // into SGPRs using v_readfirstlane if needed.
+ legalizeOperands(*NewInstr, MDT);
+ continue;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -4674,6 +4778,73 @@ void SIInstrInfo::splitScalar64BitBFE(Se
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineBasicBlock::iterator MII = Inst;
+ auto &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
+ MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
+ MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
+ MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
+
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ unsigned Count = 0;
+ const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+ const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+
+ switch(Opcode) {
+ default:
+ return;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ Count = 2;
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+ Count = 4;
+ break;
+ }
+
+ // FIXME: Should also attempt to build VAddr and Offset like the non-split
+ // case (see call site for this function)
+
+ // Create a vector of result registers
+ SmallVector<unsigned, 8> ResultRegs;
+ for (unsigned i = 0; i < Count ; ++i) {
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+ MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
+ .addReg(Offset.getReg()) // offset
+ .addReg(Rsrc.getReg()) // rsrc
+ .addImm(0) // soffset
+ .addImm(i << 4) // inst_offset
+ .addImm(Glc.getImm()) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addMemOperand(*Inst.memoperands_begin());
+ // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
+ auto &NewDestOp = NewMI.getOperand(0);
+ for (unsigned i = 0 ; i < 4 ; i++)
+ ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
+ RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
+ }
+ // Create a new combined result to replace original with
+ unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+ MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
+ get(TargetOpcode::REG_SEQUENCE), FullDestReg);
+
+ for (unsigned i = 0 ; i < Count * 4 ; ++i) {
+ CombinedResBuilder
+ .addReg(ResultRegs[i])
+ .addImm(RI.getSubRegFromChannel(i));
+ }
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
void SIInstrInfo::addUsersToMoveToVALUWorklist(
unsigned DstReg,
MachineRegisterInfo &MRI,
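
The block re-added above tries to peel a constant out of the V_ADD that feeds the divergent offset and move it into the MUBUF immediate offset field when it is encodable (isLegalMUBUFImmOffset). A rough standalone sketch of just that decision, with illustrative names and the 12-bit bound taken as an assumption rather than quoted from the header:

#include <cstdint>
#include <cstdio>

// Sketch of the folding decision in the re-added moveToVALU code: if the
// divergent offset is "add imm, reg" and imm fits the MUBUF unsigned
// immediate field, use the register operand as vaddr and move imm into the
// offset field; otherwise keep the full add result as vaddr with offset 0.
// (The 4096 bound mirrors isLegalMUBUFImmOffset; treat it as an assumption.)
static bool fitsMUBUFImmOffset(uint64_t Imm) { return Imm < 4096; }

struct MUBUFAddr {
  bool FoldedImm;      // true: vaddr = register operand of the add
  uint64_t InstOffset; // value for the immediate offset field
};

static MUBUFAddr foldAddImmediate(uint64_t AddImm) {
  if (AddImm && fitsMUBUFImmOffset(AddImm))
    return {true, AddImm};
  return {false, 0};
}

int main() {
  // 4095 folds (matching the "BUFFER_LOAD_DWORD_OFFEN ..., 0, 4095" check in
  // smrd-fold-offset.mir); 4096 does not, so the V_ADD result stays as vaddr.
  for (uint64_t Imm : {4095ull, 4096ull}) {
    MUBUFAddr A = foldAddImmediate(Imm);
    std::printf("imm %llu -> fold=%d offset=%llu\n",
                (unsigned long long)Imm, A.FoldedImm,
                (unsigned long long)A.InstOffset);
  }
  return 0;
}
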
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=346364&r1=346363&r2=346364&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Wed Nov 7 13:53:43 2018
@@ -103,6 +103,8 @@ private:
MachineInstr &Inst) const;
void splitScalar64BitBFE(SetVectorType &Worklist,
MachineInstr &Inst) const;
+ void splitScalarBuffer(SetVectorType &Worklist,
+ MachineInstr &Inst) const;
void movePackToVALU(SetVectorType &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
Modified: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp?rev=346364&r1=346363&r2=346364&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp Wed Nov 7 13:53:43 2018
@@ -894,12 +894,9 @@ bool isLegalSMRDImmOffset(const MCSubtar
// Given Imm, split it into the values to put into the SOffset and ImmOffset
// fields in an MUBUF instruction. Return false if it is not possible (due to a
// hardware bug needing a workaround).
-//
-// The required alignment ensures that individual address components remain
-// aligned if they are aligned to begin with. It also ensures that additional
-// offsets within the given alignment can be added to the resulting ImmOffset.
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget, uint32_t Align) {
+ const GCNSubtarget *Subtarget) {
+ const uint32_t Align = 4;
const uint32_t MaxImm = alignDown(4095, Align);
uint32_t Overflow = 0;
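
The Align parameter dropped here is what the reverted lowering used to request 16- and 64-byte aligned immediates for split wide loads. Below is a standalone sketch of the splitting, reconstructed as an approximation from this function and from the VIGFX9 checks removed from smrd.ll (soffset 4 / offset:4092 for a combined offset of 4096, soffset 56 / offset:4032..4080 with 64-byte alignment). Names are illustrative, and the HasSOffsetBug flag stands in for the SI/CI hardware-bug workaround mentioned in the comment above:

#include <cstdint>
#include <cstdio>

// Rough reconstruction of AMDGPU::splitMUBUFOffset (details are an
// approximation): split a combined byte offset into an SGPR soffset plus a
// MUBUF immediate offset, keeping the immediate within an aligned 0..4095
// range so per-sub-load adjustments stay encodable.
static bool splitMUBUFOffsetSketch(uint32_t Imm, uint32_t &SOffset,
                                   uint32_t &ImmOffset, bool HasSOffsetBug,
                                   uint32_t Align = 4) {
  const uint32_t MaxImm = (4095 / Align) * Align; // alignDown(4095, Align)
  uint32_t Overflow = 0;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // A small soffset (inline-constant sized) covers the overflow.
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Put the high bits into soffset so adjacent loads can share an SGPR.
      uint32_t High = (Imm + Align) & ~4095u;
      uint32_t Low = (Imm + Align) & 4095u;
      Imm = Low;
      Overflow = High - Align;
    }
  }
  // SI/CI cannot use a non-zero soffset here (hardware bug workaround),
  // so the caller falls back to a v_add on the offset instead.
  if (Overflow > 0 && HasSOffsetBug)
    return false;
  SOffset = Overflow;
  ImmOffset = Imm;
  return true;
}

int main() {
  uint32_t SOff, ImmOff;
  // 4096 with the default 4-byte alignment: soffset 4, offset 4092 on
  // VI/GFX9 (the check removed from smrd_vgpr_offset_imm_too_large).
  if (splitMUBUFOffsetSketch(4096, SOff, ImmOff, /*HasSOffsetBug=*/false))
    std::printf("soffset=%u offset=%u\n", SOff, ImmOff);
  // 4088 with 64-byte alignment (four dwordx4 sub-loads): soffset 56,
  // offsets 4032..4080, matching the removed smrd_load_nonconst4 checks.
  if (splitMUBUFOffsetSketch(4088, SOff, ImmOff, /*HasSOffsetBug=*/false, 64))
    std::printf("soffset=%u offset=%u..%u\n", SOff, ImmOff, ImmOff + 48);
  return 0;
}
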
Modified: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h?rev=346364&r1=346363&r2=346364&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h Wed Nov 7 13:53:43 2018
@@ -441,8 +441,11 @@ int64_t getSMRDEncodedOffset(const MCSub
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+// Given Imm, split it into the values to put into the SOffset and ImmOffset
+// fields in an MUBUF instruction. Return false if it is not possible (due to a
+// hardware bug needing a workaround).
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget, uint32_t Align = 4);
+ const GCNSubtarget *Subtarget);
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
Modified: llvm/trunk/test/CodeGen/AMDGPU/smrd-fold-offset.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/smrd-fold-offset.mir?rev=346364&r1=346363&r2=346364&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd-fold-offset.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd-fold-offset.mir Wed Nov 7 13:53:43 2018
@@ -1,8 +1,6 @@
# RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
-# GCN-LABEL: name: smrd_vgpr_offset_imm
-# GCN: V_READFIRSTLANE_B32
-# GCN: S_BUFFER_LOAD_DWORD_SGPR
+# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
---
name: smrd_vgpr_offset_imm
body: |
@@ -24,9 +22,7 @@ body: |
SI_RETURN_TO_EPILOG $vgpr0
...
-# GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
-# GCN: V_READFIRSTLANE_B32
-# GCN: S_BUFFER_LOAD_DWORD_SGPR
+# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
---
name: smrd_vgpr_offset_imm_add_u32
body: |
Modified: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/smrd.ll?rev=346364&r1=346363&r2=346364&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll Wed Nov 7 13:53:43 2018
@@ -292,19 +292,18 @@ main_body:
; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
; GCN-NEXT: %bb.
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
- %off = add i32 %offset, 4092
+ %off = add i32 %offset, 4095
%r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
ret float %r
}
; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
; GCN-NEXT: %bb.
-; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
-; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
-; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
+; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
%off = add i32 %offset, 4096
@@ -511,15 +510,12 @@ main_body:
}
; GCN-LABEL: {{^}}smrd_load_nonconst4:
-; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
-; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
+; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ;
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
; GCN: ; return to shader part epilog
define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
main_body:
@@ -530,16 +526,12 @@ main_body:
}
; GCN-LABEL: {{^}}smrd_load_nonconst5:
-; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
-; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
-; VIGFX9: s_movk_i32 s4, 0xfc0
-; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
+; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
; GCN: ; return to shader part epilog
define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
main_body:
@@ -567,10 +559,9 @@ main_body:
; GCN-LABEL: {{^}}smrd_uniform_loop:
;
-; TODO: we should keep the loop counter in an SGPR
+; TODO: this should use an s_buffer_load
;
-; GCN: v_readfirstlane_b32
-; GCN: s_buffer_load_dword
+; GCN: buffer_load_dword
define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
main_body:
br label %loop
@@ -594,10 +585,9 @@ exit:
; (this test differs from smrd_uniform_loop by the more complex structure of phis,
; which used to confuse the DivergenceAnalysis after structurization)
;
-; TODO: we should keep the loop counter in an SGPR
+; TODO: we should keep the loop counter in an SGPR and use an S_BUFFER_LOAD
;
-; GCN: v_readfirstlane_b32
-; GCN: s_buffer_load_dword
+; GCN: buffer_load_dword
define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
main_body:
br label %loop