[llvm] r359935 - [AMDGPU] gfx1010 loop alignment
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri May 3 14:17:30 PDT 2019
Author: rampitec
Date: Fri May 3 14:17:29 2019
New Revision: 359935
URL: http://llvm.org/viewvc/llvm-project?rev=359935&view=rev
Log:
[AMDGPU] gfx1010 loop alignment
Differential Revision: https://reviews.llvm.org/D61529
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=359935&r1=359934&r2=359935&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri May 3 14:17:29 2019
@@ -99,6 +99,11 @@ static cl::opt<unsigned> AssumeFrameInde
cl::init(5),
cl::ReallyHidden);
+static cl::opt<bool> DisableLoopAlignment(
+ "amdgpu-disable-loop-alignment",
+ cl::desc("Do not align and prefetch loops"),
+ cl::init(false));
+
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -9966,6 +9971,77 @@ void SITargetLowering::computeKnownBitsF
Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}
+unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+ const unsigned CacheLineAlign = 6; // log2(64)
+
+ // Pre-GFX10 targets did not benefit from loop alignment
+ if (!ML || DisableLoopAlignment ||
+ (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
+ getSubtarget()->hasInstFwdPrefetchBug())
+ return PrefAlign;
+
+ // On GFX10 the I$ consists of 4 cache lines of 64 bytes each.
+ // By default the prefetcher keeps one cache line behind and reads two
+ // ahead. For larger loops we can switch it with S_INST_PREFETCH to keep
+ // two lines behind and read one ahead.
+ // Therefore aligning the loop header pays off as long as the loop fits in
+ // 192 bytes.
+ // If the loop fits in 64 bytes it never spans more than two cache lines
+ // and needs no alignment.
+ // Otherwise, if the loop is at most 128 bytes we do not need to modify the
+ // prefetch settings.
+ // Otherwise, if the loop is at most 192 bytes we need two lines behind.
+
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const MachineBasicBlock *Header = ML->getHeader();
+ if (Header->getAlignment() != PrefAlign)
+ return Header->getAlignment(); // Already processed.
+
+ unsigned LoopSize = 0;
+ for (const MachineBasicBlock *MBB : ML->blocks()) {
+ // If an inner loop block is aligned, assume on average half of the
+ // alignment size is added as nops.
+ if (MBB != Header)
+ LoopSize += (1 << MBB->getAlignment()) / 2;
+
+ for (const MachineInstr &MI : *MBB) {
+ LoopSize += TII->getInstSizeInBytes(MI);
+ if (LoopSize > 192)
+ return PrefAlign;
+ }
+ }
+
+ if (LoopSize <= 64)
+ return PrefAlign;
+
+ if (LoopSize <= 128)
+ return CacheLineAlign;
+
+ // If any parent loop is already surrounded by prefetch instructions, do
+ // not insert new ones for the inner loop; that would reset the parent's
+ // settings.
+ for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
+ if (MachineBasicBlock *Exit = P->getExitBlock()) {
+ auto I = Exit->getFirstNonDebugInstr();
+ if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
+ return CacheLineAlign;
+ }
+ }
+
+ MachineBasicBlock *Pre = ML->getLoopPreheader();
+ MachineBasicBlock *Exit = ML->getExitBlock();
+
+ if (Pre && Exit) {
+ BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
+ TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(1); // prefetch 2 lines behind PC
+
+ BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
+ TII->get(AMDGPU::S_INST_PREFETCH))
+ .addImm(2); // prefetch 1 line behind PC
+ }
+
+ return CacheLineAlign;
+}
+
LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
assert(N->getOpcode() == ISD::CopyFromReg);
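For readers skimming the diff, the comments in getPrefLoopAlignment() boil
down to a simple size-based decision. Below is a minimal standalone sketch of
that decision, assuming the 64/128/192-byte thresholds and the log2(64)
cache-line alignment shown above; pickLoopAlignment() and its parameters are
hypothetical names for illustration only and are not part of the patch.

// Sketch (not part of the patch) of the size-based choice made in
// getPrefLoopAlignment(): which log2 alignment is returned and whether
// S_INST_PREFETCH instructions would be emitted around the loop.
#include <cstdio>

static unsigned pickLoopAlignment(unsigned LoopSizeBytes, unsigned PrefAlign,
                                  bool &EmitPrefetch) {
  const unsigned CacheLineAlign = 6; // log2(64)
  EmitPrefetch = false;
  if (LoopSizeBytes <= 64)    // Fits in one line: spans at most two lines.
    return PrefAlign;
  if (LoopSizeBytes <= 128)   // Two lines: align, default prefetch suffices.
    return CacheLineAlign;
  if (LoopSizeBytes <= 192) { // Three lines: align and retune the prefetcher.
    EmitPrefetch = true;
    return CacheLineAlign;
  }
  return PrefAlign;           // Larger loops: alignment would not help.
}

int main() {
  for (unsigned Size : {48u, 96u, 160u, 256u}) {
    bool Prefetch;
    unsigned Align = pickLoopAlignment(Size, /*PrefAlign=*/0, Prefetch);
    std::printf("loop size %3u bytes -> align 2^%u, prefetch insts: %s\n",
                Size, Align, Prefetch ? "yes" : "no");
  }
  return 0;
}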
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=359935&r1=359934&r2=359935&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Fri May 3 14:17:29 2019
@@ -367,6 +367,8 @@ public:
bool SNaN = false,
unsigned Depth = 0) const override;
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+
+ unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
};
} // End namespace llvm