[PATCH] D84354: [AMDGPU/MemOpsCluster] Clean-up fixme's around mem ops clustering logic
Matt Arsenault via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 22 16:15:04 PDT 2020
arsenm added inline comments.
================
Comment at: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp:477
unsigned NumBytes) const {
+ // If the mem ops in the current pair, do not have the same base ptr, then
+ // they cannot be clustered
----------------
Reads badly with the first comma
================
Comment at: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp:485
- const MachineOperand *FirstDst = nullptr;
- const MachineOperand *SecondDst = nullptr;
-
- if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
- (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
- (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) ||
- (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
- const unsigned MaxGlobalLoadCluster = 7;
- if (NumLoads > MaxGlobalLoadCluster)
- return false;
-
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
- if (!FirstDst)
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
- if (!SecondDst)
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
- } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- }
-
- if (!FirstDst || !SecondDst)
- return false;
-
- // Try to limit clustering based on the total number of bytes loaded
- // rather than the number of instructions. This is done to help reduce
- // register pressure. The method used is somewhat inexact, though,
- // because it assumes that all loads in the cluster will load the
- // same number of bytes as FirstLdSt.
-
- // The unit of this value is bytes.
- // FIXME: This needs finer tuning.
- unsigned LoadClusterThreshold = 16;
-
- const MachineRegisterInfo &MRI =
- FirstLdSt.getParent()->getParent()->getRegInfo();
-
- const Register Reg = FirstDst->getReg();
-
- const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
- ? MRI.getRegClass(Reg)
- : RI.getPhysRegClass(Reg);
-
- // FIXME: NumLoads should not be subtracted 1. This is to match behavior
- // of clusterNeighboringMemOps which was previosly passing cluster length
- // less 1. LoadClusterThreshold should be tuned instead.
- return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
- LoadClusterThreshold;
+ // The total combined max number of bytes that can be loaded by clustered
+ // mem ops is set to 32 bytes.
----------------
saying total max is redundant
================
Comment at: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp:486
+ // The total combined max number of bytes that can be loaded by clustered
+ // mem ops is set to 32 bytes.
+ return NumBytes <= 32;
----------------
drop is set
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D84354/new/
https://reviews.llvm.org/D84354
More information about the llvm-commits
mailing list