[llvm] [AMDGPU] Refactor out common exec mask opcode patterns (NFCI) (PR #154718)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 21 03:07:41 PDT 2025
https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/154718
Create a utility mechanism for finding wave-size-dependent opcodes used to manipulate exec/lane masks.
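
For reviewers skimming the patch, the pattern it introduces looks roughly like
the sketch below (adapted from the hunks that follow; ST, MBB, I, DL, TII,
DstReg and SrcReg stand in for the usual pass-local state, so this is an
illustrative fragment rather than a compilable unit):

    // Before: every call site selects the wave32/wave64 opcode and exec
    // register by hand.
    unsigned AndOpc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
    Register Exec   = ST.isWave32() ? AMDGPU::EXEC_LO   : AMDGPU::EXEC;

    // After: look the wave-size-dependent constants up once and reuse them.
    const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
    BuildMI(MBB, I, DL, TII->get(LMC.AndOpc), DstReg)
        .addReg(LMC.ExecReg)
        .addReg(SrcReg);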
From 09ba67c6beee6611b3e1f25adce9e88a74cce019 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 21 Aug 2025 14:37:42 +0900
Subject: [PATCH] [AMDGPU] Refactor out common exec mask opcode patterns (NFCI)
Create a utility mechanism for finding wave-size-dependent opcodes
used to manipulate exec/lane masks.
---
.../lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp | 75 +++++++++
llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h | 52 +++++++
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 36 ++---
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 15 +-
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 21 ++-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 24 ++-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 106 +++++--------
.../Target/AMDGPU/SILateBranchLowering.cpp | 23 ++-
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 112 +++++---------
.../Target/AMDGPU/SIOptimizeExecMasking.cpp | 94 ++++++------
.../AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 29 ++--
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 142 +++++++-----------
13 files changed, 362 insertions(+), 368 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp
new file mode 100644
index 0000000000000..8690afcdeef6d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPULaneMaskUtils.cpp - -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULaneMaskUtils.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+
+#define DEBUG_TYPE "amdgpu-lane-mask-utils"
+
+using namespace llvm;
+
+namespace llvm::AMDGPU {
+
+LaneMaskConstants::LaneMaskConstants(unsigned WavefrontSize) {
+ if (WavefrontSize == 32) {
+ ExecReg = AMDGPU::EXEC_LO;
+ VccReg = AMDGPU::VCC_LO;
+ AndOpc = AMDGPU::S_AND_B32;
+ AndTermOpc = AMDGPU::S_AND_B32_term;
+ AndN2Opc = AMDGPU::S_ANDN2_B32;
+ AndN2SaveExecOpc = AMDGPU::S_ANDN2_SAVEEXEC_B32;
+ AndN2TermOpc = AMDGPU::S_ANDN2_B32_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
+ AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
+ BfmOpc = AMDGPU::S_BFM_B32;
+ CMovOpc = AMDGPU::S_CMOV_B32;
+ CSelectOpc = AMDGPU::S_CSELECT_B32;
+ MovOpc = AMDGPU::S_MOV_B32;
+ MovTermOpc = AMDGPU::S_MOV_B32_term;
+ OrOpc = AMDGPU::S_OR_B32;
+ OrTermOpc = AMDGPU::S_OR_B32_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ XorOpc = AMDGPU::S_XOR_B32;
+ XorTermOpc = AMDGPU::S_XOR_B32_term;
+ WQMOpc = AMDGPU::S_WQM_B32;
+ } else {
+ ExecReg = AMDGPU::EXEC;
+ VccReg = AMDGPU::VCC;
+ AndOpc = AMDGPU::S_AND_B64;
+ AndTermOpc = AMDGPU::S_AND_B64_term;
+ AndN2Opc = AMDGPU::S_ANDN2_B64;
+ AndN2SaveExecOpc = AMDGPU::S_ANDN2_SAVEEXEC_B64;
+ AndN2TermOpc = AMDGPU::S_ANDN2_B64_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
+ AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
+ BfmOpc = AMDGPU::S_BFM_B64;
+ CMovOpc = AMDGPU::S_CMOV_B64;
+ CSelectOpc = AMDGPU::S_CSELECT_B64;
+ MovOpc = AMDGPU::S_MOV_B64;
+ MovTermOpc = AMDGPU::S_MOV_B64_term;
+ OrOpc = AMDGPU::S_OR_B64;
+ OrTermOpc = AMDGPU::S_OR_B64_term;
+ OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ XorOpc = AMDGPU::S_XOR_B64;
+ XorTermOpc = AMDGPU::S_XOR_B64_term;
+ WQMOpc = AMDGPU::S_WQM_B64;
+ }
+}
+
+static const LaneMaskConstants Wave32LaneMaskConstants(32);
+static const LaneMaskConstants Wave64LaneMaskConstants(64);
+
+const LaneMaskConstants &getLaneMaskConstants(const GCNSubtarget *ST) {
+ unsigned WavefrontSize = ST->getWavefrontSize();
+ assert(WavefrontSize == 32 || WavefrontSize == 64);
+ return WavefrontSize == 32 ? Wave32LaneMaskConstants
+ : Wave64LaneMaskConstants;
+}
+
+} // end namespace llvm::AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
new file mode 100644
index 0000000000000..6c11dbd73ef3b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h
@@ -0,0 +1,52 @@
+//===- AMDGPULaneMaskUtils.h - Exec/lane mask helper functions -*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
+
+#include "llvm/CodeGen/Register.h"
+
+namespace llvm {
+
+class GCNSubtarget;
+
+namespace AMDGPU {
+
+class LaneMaskConstants {
+public:
+ Register ExecReg;
+ Register VccReg;
+ unsigned AndOpc;
+ unsigned AndTermOpc;
+ unsigned AndN2Opc;
+ unsigned AndN2SaveExecOpc;
+ unsigned AndN2TermOpc;
+ unsigned AndSaveExecOpc;
+ unsigned AndSaveExecTermOpc;
+ unsigned BfmOpc;
+ unsigned CMovOpc;
+ unsigned CSelectOpc;
+ unsigned MovOpc;
+ unsigned MovTermOpc;
+ unsigned OrOpc;
+ unsigned OrTermOpc;
+ unsigned OrSaveExecOpc;
+ unsigned XorOpc;
+ unsigned XorTermOpc;
+ unsigned WQMOpc;
+
+ LaneMaskConstants(unsigned WavefrontSize);
+};
+
+const LaneMaskConstants &getLaneMaskConstants(const GCNSubtarget *ST);
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEMASKUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 237929699dd9d..7c37347139607 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -73,6 +73,7 @@
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
@@ -783,17 +784,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineFunction *MF = &B.getMF();
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
- const unsigned MovExecOpc =
- Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- const unsigned MovExecTermOpc =
- Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
-
- const unsigned XorTermOpc = Subtarget.isWave32() ?
- AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
- const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
- AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
- const unsigned ExecReg = Subtarget.isWave32() ?
- AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const AMDGPU::LaneMaskConstants &LMC =
+ AMDGPU::getLaneMaskConstants(&Subtarget);
#ifndef NDEBUG
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
@@ -941,19 +933,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MRI.setRegClass(CondReg, WaveRC);
// Update EXEC, save the original EXEC value to VCC.
- B.buildInstr(AndSaveExecOpc)
- .addDef(NewExec)
- .addReg(CondReg, RegState::Kill);
+ B.buildInstr(LMC.AndSaveExecOpc)
+ .addDef(NewExec)
+ .addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
B.setInsertPt(*BodyBB, BodyBB->end());
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- B.buildInstr(XorTermOpc)
- .addDef(ExecReg)
- .addReg(ExecReg)
- .addReg(NewExec);
+ B.buildInstr(LMC.XorTermOpc)
+ .addDef(LMC.ExecReg)
+ .addReg(LMC.ExecReg)
+ .addReg(NewExec);
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?
@@ -962,14 +954,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
// Save the EXEC mask before the loop.
- BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
- .addReg(ExecReg);
+ BuildMI(MBB, MBB.end(), DL, TII->get(LMC.MovOpc), SaveExecReg)
+ .addReg(LMC.ExecReg);
// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
- B.buildInstr(MovExecTermOpc)
- .addDef(ExecReg)
- .addReg(SaveExecReg);
+ B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
// Set the insert point after the original instruction, so any new
// instructions will be in the remainder.
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130ea..f2baf0787bcf4 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -69,6 +69,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULegalizerInfo.cpp
AMDGPULibCalls.cpp
AMDGPUImageIntrinsicOptimizer.cpp
+ AMDGPULaneMaskUtils.cpp
AMDGPULibFunc.cpp
AMDGPULowerBufferFatPointers.cpp
AMDGPULowerKernelArguments.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index dce4e6f993005..64e8e5e4cc00a 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -66,6 +66,7 @@
#include "SIFixSGPRCopies.h"
#include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -1134,7 +1135,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
}
void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
- bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32();
+ const AMDGPU::LaneMaskConstants &LMC =
+ AMDGPU::getLaneMaskConstants(&MF.getSubtarget<GCNSubtarget>());
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
++I) {
@@ -1148,10 +1150,7 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
Register SCCCopy =
MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
- MI.getDebugLoc(),
- TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
- : AMDGPU::S_CSELECT_B64),
- SCCCopy)
+ MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy)
.addImm(-1)
.addImm(0);
I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
@@ -1161,14 +1160,12 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
continue;
}
if (DstReg == AMDGPU::SCC) {
- unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)),
- MI.getDebugLoc(), TII->get(Opcode))
+ MI.getDebugLoc(), TII->get(LMC.AndOpc))
.addReg(Tmp, getDefRegState(true))
.addReg(SrcReg)
- .addReg(Exec);
+ .addReg(LMC.ExecReg);
MI.eraseFromParent();
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9b348d46fec4f..7895c518103d5 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -8,6 +8,7 @@
#include "SIFrameLowering.h"
#include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
@@ -984,6 +985,7 @@ void SIFrameLowering::emitCSRSpillStores(
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
// Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
// registers. However, save all lanes of callee-saved VGPRs. Due to this, we
@@ -1015,8 +1017,7 @@ void SIFrameLowering::emitCSRSpillStores(
StoreWWMRegisters(WWMScratchRegs);
auto EnableAllLanes = [&]() {
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+ BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
};
if (!WWMCalleeSavedRegs.empty()) {
@@ -1043,8 +1044,7 @@ void SIFrameLowering::emitCSRSpillStores(
TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
} else if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
- unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
+ BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
.addReg(ScratchExecCopy, RegState::Kill);
LiveUnits.addReg(ScratchExecCopy);
}
@@ -1092,6 +1092,7 @@ void SIFrameLowering::emitCSRSpillRestores(
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
@@ -1138,16 +1139,14 @@ void SIFrameLowering::emitCSRSpillRestores(
Register OrigExec = Return.getOperand(0).getReg();
if (!WWMScratchRegs.empty()) {
- unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
- BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
+ BuildMI(MBB, MBBI, DL, TII->get(LMC.XorOpc), LMC.ExecReg)
.addReg(OrigExec)
.addImm(-1);
RestoreWWMRegisters(WWMScratchRegs);
}
// Restore original EXEC.
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+ BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addReg(OrigExec);
return;
}
@@ -1159,8 +1158,7 @@ void SIFrameLowering::emitCSRSpillRestores(
RestoreWWMRegisters(WWMScratchRegs);
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+ BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
} else {
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ false,
@@ -1171,8 +1169,7 @@ void SIFrameLowering::emitCSRSpillRestores(
RestoreWWMRegisters(WWMCalleeSavedRegs);
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
- unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
+ BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
.addReg(ScratchExecCopy, RegState::Kill);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..74f348b6771cd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14,6 +14,7 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPULaneMaskUtils.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -4809,6 +4810,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
MachineFunction *MF = OrigBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
MachineBasicBlock::iterator I = LoopBB.begin();
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
@@ -4840,10 +4842,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
.addReg(Idx.getReg(), 0, Idx.getSubReg());
// Update EXEC, save the original EXEC value to VCC.
- BuildMI(LoopBB, I, DL,
- TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
- : AMDGPU::S_AND_SAVEEXEC_B64),
- NewExec)
+ BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
@@ -4870,13 +4869,9 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
}
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
MachineInstr *InsertPt =
- BuildMI(LoopBB, I, DL,
- TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
- : AMDGPU::S_XOR_B64_term),
- Exec)
- .addReg(Exec)
+ BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
.addReg(NewExec);
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
@@ -4911,15 +4906,14 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
// Save the EXEC mask
// clang-format off
- BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
- .addReg(Exec);
+ BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
+ .addReg(LMC.ExecReg);
// clang-format on
auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
@@ -4939,7 +4933,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
LoopBB->addSuccessor(LandingPad);
MachineBasicBlock::iterator First = LandingPad->begin();
// clang-format off
- BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
+ BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
.addReg(SaveExec);
// clang-format on
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index df638bd65bdaa..7800897322d24 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -14,6 +14,7 @@
#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
@@ -1195,6 +1196,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
"Not a VGPR32 reg");
@@ -1213,10 +1215,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
switch (Cond[0].getImm()) {
case SIInstrInfo::SCC_TRUE: {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
- BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
- : AMDGPU::S_CSELECT_B64), SReg)
- .addImm(1)
- .addImm(0);
+ BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
@@ -1227,10 +1226,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
}
case SIInstrInfo::SCC_FALSE: {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
- BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
- : AMDGPU::S_CSELECT_B64), SReg)
- .addImm(0)
- .addImm(1);
+ BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
@@ -1270,13 +1266,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::EXECNZ: {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
- BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
- : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
- .addImm(0);
- BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
- : AMDGPU::S_CSELECT_B64), SReg)
- .addImm(1)
- .addImm(0);
+ BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
+ BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
@@ -1288,13 +1279,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::EXECZ: {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
- BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
- : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
- .addImm(0);
- BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
- : AMDGPU::S_CSELECT_B64), SReg)
- .addImm(0)
- .addImm(1);
+ BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
+ BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
@@ -2046,6 +2032,7 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
switch (MI.getOpcode()) {
default: return TargetInstrInfo::expandPostRAPseudo(MI);
case AMDGPU::S_MOV_B64_term:
@@ -2470,18 +2457,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::ENTER_STRICT_WWM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
// Whole Wave Mode is entered.
- MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
- : AMDGPU::S_OR_SAVEEXEC_B64));
+ MI.setDesc(get(LMC.OrSaveExecOpc));
break;
}
case AMDGPU::ENTER_STRICT_WQM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
// STRICT_WQM is entered.
- const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
- const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
- BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
+ BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
+ .addReg(LMC.ExecReg);
+ BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
MI.eraseFromParent();
break;
@@ -2490,7 +2474,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::EXIT_STRICT_WQM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
// WWM/STRICT_WQM is exited.
- MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
+ MI.setDesc(get(LMC.MovOpc));
break;
}
case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
@@ -5821,25 +5805,22 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
SlotIndexes *Indexes) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- bool IsWave32 = ST.isWave32();
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
if (IsSCCLive) {
// Insert two move instructions, one to save the original value of EXEC and
// the other to turn on all bits in EXEC. This is required as we can't use
// the single instruction S_OR_SAVEEXEC that clobbers SCC.
- unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
- .addReg(Exec, RegState::Kill);
- auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+ auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
+ .addReg(LMC.ExecReg, RegState::Kill);
+ auto FlipExecMI =
+ BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
if (Indexes) {
Indexes->insertMachineInstrInMaps(*StoreExecMI);
Indexes->insertMachineInstrInMaps(*FlipExecMI);
}
} else {
- const unsigned OrSaveExec =
- IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
auto SaveExec =
- BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
+ BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
if (Indexes)
Indexes->insertMachineInstrInMaps(*SaveExec);
@@ -5850,10 +5831,9 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, Register Reg,
SlotIndexes *Indexes) const {
- unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- auto ExecRestoreMI =
- BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
+ auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
+ .addReg(Reg, RegState::Kill);
if (Indexes)
Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
}
@@ -6718,13 +6698,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
MachineFunction &MF = *LoopBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- unsigned SaveExecOpc =
- ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
- unsigned XorTermOpc =
- ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
- unsigned AndOpc =
- ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
MachineBasicBlock::iterator I = LoopBB.begin();
@@ -6752,7 +6726,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
CondReg = NewCondReg;
else { // If not the first, we create an AND.
Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
- BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
+ BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
.addReg(CondReg)
.addReg(NewCondReg);
CondReg = AndReg;
@@ -6808,7 +6782,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
CondReg = NewCondReg;
else { // If not the first, we create an AND.
Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
- BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
+ BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
.addReg(CondReg)
.addReg(NewCondReg);
CondReg = AndReg;
@@ -6837,15 +6811,15 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
MRI.setSimpleHint(SaveExec, CondReg);
// Update EXEC to matching lanes, saving original to SaveExec.
- BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
+ BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
.addReg(CondReg, RegState::Kill);
// The original instruction is here; we insert the terminators after it.
I = BodyBB.end();
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
- .addReg(Exec)
+ BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
.addReg(SaveExec);
BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
@@ -6872,8 +6846,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
++End;
}
const DebugLoc &DL = MI.getDebugLoc();
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
// Save SCC. Waterfall Loop may overwrite SCC.
@@ -6895,7 +6868,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
// Save the EXEC mask
- BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
+ BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
// Killed uses in the instruction we are waterfalling around will be
// incorrect due to the added control-flow.
@@ -6956,7 +6929,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
}
// Restore the EXEC mask
- BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
+ BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
+ .addReg(SaveExec);
return BodyBB;
}
@@ -7651,12 +7625,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Clear unused bits of vcc
Register CondReg = Inst.getOperand(1).getReg();
bool IsSCC = CondReg == AMDGPU::SCC;
- Register VCC = RI.getVCC();
- Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
- .addReg(EXEC)
- .addReg(IsSCC ? VCC : CondReg);
+ const AMDGPU::LaneMaskConstants &LMC = AMDGPU::getLaneMaskConstants(&ST);
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
+ .addReg(LMC.ExecReg)
+ .addReg(IsSCC ? LMC.VccReg : CondReg);
Inst.removeOperand(1);
} break;
@@ -10110,9 +10082,7 @@ MachineInstr *SIInstrInfo::createPHISourceCopy(
InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
InsPt++;
return BuildMI(MBB, InsPt, DL,
- get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
- : AMDGPU::S_MOV_B64_term),
- Dst)
+ get(AMDGPU::getLaneMaskConstants(&ST).MovTermOpc), Dst)
.addReg(Src, 0, SrcSubReg)
.addReg(AMDGPU::EXEC, RegState::Implicit);
}
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index 3f7b0eab6bb8c..d77f0d381d64a 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
@@ -29,6 +30,7 @@ class SILateBranchLowering {
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
MachineDominatorTree *MDT = nullptr;
+ const AMDGPU::LaneMaskConstants *LMC = nullptr;
void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST,
bool DynamicVGPR);
@@ -38,9 +40,6 @@ class SILateBranchLowering {
SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {}
bool run(MachineFunction &MF);
-
- unsigned MovOpc;
- Register ExecReg;
};
class SILateBranchLoweringLegacy : public MachineFunctionPass {
@@ -165,17 +164,17 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI,
copyOpWithoutRegFlags(SelectCallee,
*TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee));
- auto SelectExec = BuildMI(*MI.getParent(), MI, DL,
- TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
- : AMDGPU::S_CSELECT_B64))
- .addDef(ExecReg);
+ auto SelectExec =
+ BuildMI(*MI.getParent(), MI, DL, TII->get(LMC->CSelectOpc))
+ .addDef(LMC->ExecReg);
copyOpWithoutRegFlags(SelectExec,
*TII->getNamedOperand(MI, AMDGPU::OpName::exec));
copyOpWithoutRegFlags(SelectExec,
*TII->getNamedOperand(MI, AMDGPU::OpName::fbexec));
} else {
- auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg);
+ auto SetExec =
+ BuildMI(*MI.getParent(), MI, DL, TII->get(LMC->MovOpc), LMC->ExecReg);
copyOpWithoutRegFlags(SetExec,
*TII->getNamedOperand(MI, AMDGPU::OpName::exec));
}
@@ -217,9 +216,7 @@ bool SILateBranchLowering::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
-
- MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ LMC = &AMDGPU::getLaneMaskConstants(&ST);
SmallVector<MachineInstr *, 4> EarlyTermInstrs;
SmallVector<MachineInstr *, 1> EpilogInstrs;
@@ -269,8 +266,8 @@ bool SILateBranchLowering::run(MachineFunction &MF) {
DebugLoc DL;
MF.insert(MF.end(), EarlyExitBlock);
- BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
- ExecReg)
+ BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(LMC->MovOpc),
+ LMC->ExecReg)
.addImm(0);
generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF);
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index e97536d36bab2..760956c5bd4c5 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -50,6 +50,7 @@
#include "SILowerControlFlow.h"
#include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
@@ -85,15 +86,7 @@ class SILowerControlFlow {
SmallSet<Register, 8> RecomputeRegs;
const TargetRegisterClass *BoolRC = nullptr;
- unsigned AndOpc;
- unsigned OrOpc;
- unsigned XorOpc;
- unsigned MovTermOpc;
- unsigned Andn2TermOpc;
- unsigned XorTermrOpc;
- unsigned OrTermrOpc;
- unsigned OrSaveExecOpc;
- unsigned Exec;
+ const AMDGPU::LaneMaskConstants &LMC;
bool EnableOptimizeEndCf = false;
@@ -139,9 +132,11 @@ class SILowerControlFlow {
void optimizeEndCf();
public:
- SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV,
- MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
- : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {}
+ SILowerControlFlow(const GCNSubtarget *ST, LiveIntervals *LIS,
+ LiveVariables *LV, MachineDominatorTree *MDT,
+ MachinePostDominatorTree *PDT)
+ : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT),
+ LMC(AMDGPU::getLaneMaskConstants(ST)) {}
bool run(MachineFunction &MF);
};
@@ -243,18 +238,15 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// will interfere with trying to form s_and_saveexec_b64 later.
Register CopyReg = SimpleIf ? SaveExecReg
: MRI->createVirtualRegister(BoolRC);
- MachineInstr *CopyExec =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
- .addReg(Exec)
- .addReg(Exec, RegState::ImplicitDefine);
+ MachineInstr *CopyExec = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .addReg(LMC.ExecReg)
+ .addReg(LMC.ExecReg, RegState::ImplicitDefine);
LoweredIf.insert(CopyReg);
Register Tmp = MRI->createVirtualRegister(BoolRC);
MachineInstr *And =
- BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
- .addReg(CopyReg)
- .add(Cond);
+ BuildMI(MBB, I, DL, TII->get(LMC.AndOpc), Tmp).addReg(CopyReg).add(Cond);
if (LV)
LV->replaceKillInstruction(Cond.getReg(), MI, *And);
@@ -262,18 +254,17 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineInstr *Xor = nullptr;
if (!SimpleIf) {
- Xor =
- BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
- .addReg(Tmp)
- .addReg(CopyReg);
+ Xor = BuildMI(MBB, I, DL, TII->get(LMC.XorOpc), SaveExecReg)
+ .addReg(Tmp)
+ .addReg(CopyReg);
setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
}
// Use a copy that is a terminator to get correct spill code placement it with
// fast regalloc.
MachineInstr *SetExec =
- BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
- .addReg(Tmp, RegState::Kill);
+ BuildMI(MBB, I, DL, TII->get(LMC.MovTermOpc), LMC.ExecReg)
+ .addReg(Tmp, RegState::Kill);
if (LV)
LV->getVarInfo(Tmp).Kills.push_back(SetExec);
@@ -327,8 +318,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
// else.
Register SaveReg = MRI->createVirtualRegister(BoolRC);
MachineInstr *OrSaveExec =
- BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
- .add(MI.getOperand(1)); // Saved EXEC
+ BuildMI(MBB, Start, DL, TII->get(LMC.OrSaveExecOpc), SaveReg)
+ .add(MI.getOperand(1)); // Saved EXEC
if (LV)
LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);
@@ -338,14 +329,14 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
// This accounts for any modification of the EXEC mask within the block and
// can be optimized out pre-RA when not required.
- MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
- .addReg(Exec)
+ MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(LMC.AndOpc), DstReg)
+ .addReg(LMC.ExecReg)
.addReg(SaveReg);
MachineInstr *Xor =
- BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
- .addReg(Exec)
- .addReg(DstReg);
+ BuildMI(MBB, ElsePt, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
+ .addReg(DstReg);
// Skip ahead to the unconditional branch in case there are other terminators
// present.
@@ -400,16 +391,16 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
Register AndReg;
if (!SkipAnding) {
AndReg = MRI->createVirtualRegister(BoolRC);
- And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
- .addReg(Exec)
- .add(MI.getOperand(1));
+ And = BuildMI(MBB, &MI, DL, TII->get(LMC.AndOpc), AndReg)
+ .addReg(LMC.ExecReg)
+ .add(MI.getOperand(1));
if (LV)
LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And);
- Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
+ Or = BuildMI(MBB, &MI, DL, TII->get(LMC.OrOpc), Dst)
.addReg(AndReg)
.add(MI.getOperand(2));
} else {
- Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
+ Or = BuildMI(MBB, &MI, DL, TII->get(LMC.OrOpc), Dst)
.add(MI.getOperand(1))
.add(MI.getOperand(2));
if (LV)
@@ -436,8 +427,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *AndN2 =
- BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
- .addReg(Exec)
+ BuildMI(MBB, &MI, DL, TII->get(LMC.AndN2TermOpc), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
.add(MI.getOperand(0));
if (LV)
LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2);
@@ -505,7 +496,7 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
}
}
- unsigned Opcode = OrOpc;
+ unsigned Opcode = LMC.OrOpc;
MachineBasicBlock *SplitBB = &MBB;
if (NeedBlockSplit) {
SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
@@ -522,14 +513,13 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
if (PDT)
PDT->applyUpdates(DTUpdates);
}
- Opcode = OrTermrOpc;
+ Opcode = LMC.OrTermOpc;
InsPt = MI;
}
- MachineInstr *NewMI =
- BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
- .addReg(Exec)
- .add(MI.getOperand(0));
+ MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(Opcode), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
+ .add(MI.getOperand(0));
if (LV) {
LV->replaceKillInstruction(DataReg, MI, *NewMI);
@@ -597,12 +587,12 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
// does not really modify exec.
for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
- !(I->isCopy() && I->getOperand(0).getReg() != Exec))
+ !(I->isCopy() && I->getOperand(0).getReg() != LMC.ExecReg))
return;
for (const auto &SrcOp : Def->explicit_operands())
if (SrcOp.isReg() && SrcOp.isUse() &&
- (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
+ (SrcOp.getReg().isVirtual() || SrcOp.getReg() == LMC.ExecReg))
Src.push_back(SrcOp);
}
@@ -781,28 +771,6 @@ bool SILowerControlFlow::run(MachineFunction &MF) {
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
- if (ST.isWave32()) {
- AndOpc = AMDGPU::S_AND_B32;
- OrOpc = AMDGPU::S_OR_B32;
- XorOpc = AMDGPU::S_XOR_B32;
- MovTermOpc = AMDGPU::S_MOV_B32_term;
- Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
- XorTermrOpc = AMDGPU::S_XOR_B32_term;
- OrTermrOpc = AMDGPU::S_OR_B32_term;
- OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
- Exec = AMDGPU::EXEC_LO;
- } else {
- AndOpc = AMDGPU::S_AND_B64;
- OrOpc = AMDGPU::S_OR_B64;
- XorOpc = AMDGPU::S_XOR_B64;
- MovTermOpc = AMDGPU::S_MOV_B64_term;
- Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
- XorTermrOpc = AMDGPU::S_XOR_B64_term;
- OrTermrOpc = AMDGPU::S_OR_B64_term;
- OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
- Exec = AMDGPU::EXEC;
- }
-
// Compute set of blocks with kills
const bool CanDemote =
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -876,6 +844,7 @@ bool SILowerControlFlow::run(MachineFunction &MF) {
}
bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
// This doesn't actually need LiveIntervals, but we can preserve them.
auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
@@ -888,12 +857,13 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) {
getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
MachinePostDominatorTree *PDT =
PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
- return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
+ return SILowerControlFlow(ST, LIS, LV, MDT, PDT).run(MF);
}
PreservedAnalyses
SILowerControlFlowPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
+ const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
LiveIntervals *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF);
LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF);
MachineDominatorTree *MDT =
@@ -901,7 +871,7 @@ SILowerControlFlowPass::run(MachineFunction &MF,
MachinePostDominatorTree *PDT =
MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
- bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF);
+ bool Changed = SILowerControlFlow(ST, LIS, LV, MDT, PDT).run(MF);
if (!Changed)
return PreservedAnalyses::all();
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 745e4086bc7fe..16525e3d476bb 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -8,6 +8,7 @@
#include "SIOptimizeExecMasking.h"
#include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
@@ -25,12 +26,20 @@ using namespace llvm;
namespace {
class SIOptimizeExecMasking {
- MachineFunction *MF = nullptr;
- const GCNSubtarget *ST = nullptr;
- const SIRegisterInfo *TRI = nullptr;
- const SIInstrInfo *TII = nullptr;
- const MachineRegisterInfo *MRI = nullptr;
- MCRegister Exec;
+public:
+ SIOptimizeExecMasking(MachineFunction *MF)
+ : MF(MF), ST(&MF->getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
+ TRI(&TII->getRegisterInfo()), MRI(&MF->getRegInfo()),
+ LMC(AMDGPU::getLaneMaskConstants(ST)) {}
+ bool run();
+
+private:
+ MachineFunction *MF;
+ const GCNSubtarget *ST;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ const AMDGPU::LaneMaskConstants &LMC;
DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
@@ -57,13 +66,10 @@ class SIOptimizeExecMasking {
bool optimizeExecSequence();
void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
- MachineInstr &VCmp, MCRegister Exec) const;
+ MachineInstr &VCmp) const;
void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
bool optimizeOrSaveexecXorSequences();
-
-public:
- bool run(MachineFunction &MF);
};
class SIOptimizeExecMaskingLegacy : public MachineFunctionPass {
@@ -91,9 +97,9 @@ class SIOptimizeExecMaskingLegacy : public MachineFunctionPass {
PreservedAnalyses
SIOptimizeExecMaskingPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &) {
- SIOptimizeExecMasking Impl;
+ SIOptimizeExecMasking Impl(&MF);
- if (!Impl.run(MF))
+ if (!Impl.run())
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
@@ -120,7 +126,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B32_term: {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isReg() && Src.getReg() == Exec)
+ if (Src.isReg() && Src.getReg() == LMC.ExecReg)
return MI.getOperand(0).getReg();
}
}
@@ -135,7 +141,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B64:
case AMDGPU::S_MOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
+ if (Dst.isReg() && Dst.getReg() == LMC.ExecReg && MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
@@ -471,7 +477,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() {
isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
- PrepareExecInst->getOperand(0).setReg(Exec);
+ PrepareExecInst->getOperand(0).setReg(LMC.ExecReg);
LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
@@ -496,7 +502,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() {
J = std::next(CopyFromExecInst->getIterator()),
JE = I->getIterator();
J != JE; ++J) {
- if (SaveExecInst && J->readsRegister(Exec, TRI)) {
+ if (SaveExecInst && J->readsRegister(LMC.ExecReg, TRI)) {
LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
// Make sure this is inserted after any VALU ops that may have been
// scheduled in between.
@@ -580,8 +586,8 @@ bool SIOptimizeExecMasking::optimizeExecSequence() {
CopyToExecInst->eraseFromParent();
for (MachineInstr *OtherInst : OtherUseInsts) {
- OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
- *TRI);
+ OtherInst->substituteRegister(CopyToExec, LMC.ExecReg,
+ AMDGPU::NoSubRegister, *TRI);
}
Changed = true;
@@ -593,7 +599,7 @@ bool SIOptimizeExecMasking::optimizeExecSequence() {
// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
- MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
+ MachineInstr &SaveExecInstr, MachineInstr &VCmp) const {
const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
if (NewOpcode == -1)
@@ -610,7 +616,7 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
- .addReg(Exec);
+ .addReg(LMC.ExecReg);
}
// Omit dst as V_CMPX is implicitly writing to EXEC.
@@ -661,10 +667,7 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
if (!ST->hasGFX10_3Insts())
return;
- const unsigned AndSaveExecOpcode =
- ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-
- if (MI.getOpcode() != AndSaveExecOpcode)
+ if (MI.getOpcode() != LMC.AndSaveExecOpc)
return;
Register SaveExecDest = MI.getOperand(0).getReg();
@@ -690,7 +693,7 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
},
- {Exec, SaveExecSrc0->getReg()});
+ {LMC.ExecReg, SaveExecSrc0->getReg()});
if (!VCmp)
return;
@@ -748,32 +751,28 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
// to be replaced with
// s_andn2_saveexec s_o, s_i.
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
- const unsigned XorOpcode =
- ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
-
- if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
+ if (MI.getOpcode() == LMC.XorOpc && &MI != &MI.getParent()->front()) {
const MachineOperand &XorDst = MI.getOperand(0);
const MachineOperand &XorSrc0 = MI.getOperand(1);
const MachineOperand &XorSrc1 = MI.getOperand(2);
- if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
+ if (XorDst.isReg() && XorDst.getReg() == LMC.ExecReg && XorSrc0.isReg() &&
XorSrc1.isReg() &&
- (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
- const unsigned OrSaveexecOpcode = ST->isWave32()
- ? AMDGPU::S_OR_SAVEEXEC_B32
- : AMDGPU::S_OR_SAVEEXEC_B64;
+ (XorSrc0.getReg() == LMC.ExecReg || XorSrc1.getReg() == LMC.ExecReg)) {
// Peek at the previous instruction and check if this is a relevant
// s_or_saveexec instruction.
MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
- if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
+ if (PossibleOrSaveexec.getOpcode() != LMC.OrSaveExecOpc)
return;
const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
if (OrDst.isReg() && OrSrc0.isReg()) {
- if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
- (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
+ if ((XorSrc0.getReg() == LMC.ExecReg &&
+ XorSrc1.getReg() == OrDst.getReg()) ||
+ (XorSrc0.getReg() == OrDst.getReg() &&
+ XorSrc1.getReg() == LMC.ExecReg)) {
OrXors.emplace_back(&PossibleOrSaveexec, &MI);
}
}
@@ -787,15 +786,13 @@ bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
}
bool Changed = false;
- const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
- : AMDGPU::S_ANDN2_SAVEEXEC_B64;
for (const auto &Pair : OrXors) {
MachineInstr *Or = nullptr;
MachineInstr *Xor = nullptr;
std::tie(Or, Xor) = Pair;
BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
- TII->get(Andn2Opcode), Or->getOperand(0).getReg())
+ TII->get(LMC.AndN2SaveExecOpc), Or->getOperand(0).getReg())
.addReg(Or->getOperand(1).getReg());
Or->eraseFromParent();
@@ -811,24 +808,17 @@ bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
- return SIOptimizeExecMasking().run(MF);
+ return SIOptimizeExecMasking(&MF).run();
}
-bool SIOptimizeExecMasking::run(MachineFunction &MF) {
- this->MF = &MF;
- ST = &MF.getSubtarget<GCNSubtarget>();
- TRI = ST->getRegisterInfo();
- TII = ST->getInstrInfo();
- MRI = &MF.getRegInfo();
- Exec = TRI->getExec();
-
+bool SIOptimizeExecMasking::run() {
bool Changed = optimizeExecSequence();
OrXors.clear();
SaveExecVCmpMapping.clear();
KillFlagCandidates.clear();
static unsigned SearchWindow = 10;
- for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock &MBB : *MF) {
unsigned SearchCount = 0;
for (auto &MI : llvm::reverse(MBB)) {
@@ -842,7 +832,7 @@ bool SIOptimizeExecMasking::run(MachineFunction &MF) {
tryRecordOrSaveexecXorSequence(MI);
tryRecordVCmpxAndSaveexecSequence(MI);
- if (MI.modifiesRegister(Exec, TRI)) {
+ if (MI.modifiesRegister(LMC.ExecReg, TRI)) {
break;
}
@@ -855,7 +845,7 @@ bool SIOptimizeExecMasking::run(MachineFunction &MF) {
MachineInstr *SaveExecInstr = Entry.getFirst();
MachineInstr *VCmpInstr = Entry.getSecond();
- Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
+ Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr);
}
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index b2228574378f1..f02aab40417a8 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -14,6 +14,7 @@
#include "SIOptimizeExecMaskingPreRA.h"
#include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -32,11 +33,8 @@ class SIOptimizeExecMaskingPreRA {
const SIInstrInfo *TII;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
+ const AMDGPU::LaneMaskConstants *LMC;
- unsigned AndOpc;
- unsigned Andn2Opc;
- unsigned OrSaveExecOpc;
- unsigned XorTermrOpc;
MCRegister CondReg;
MCRegister ExecReg;
@@ -138,8 +136,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
auto *And =
TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS);
- if (!And || And->getOpcode() != AndOpc ||
- !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
+ if (!And || And->getOpcode() != LMC->AndOpc || !And->getOperand(1).isReg() ||
+ !And->getOperand(2).isReg())
return false;
MachineOperand *AndCC = &And->getOperand(1);
@@ -207,7 +205,7 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
<< *And);
MachineInstr *Andn2 =
- BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
+ BuildMI(MBB, *And, And->getDebugLoc(), TII->get(LMC->AndN2Opc),
And->getOperand(0).getReg())
.addReg(ExecReg)
.addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
@@ -294,11 +292,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
// Check this is an else block.
auto First = MBB.begin();
MachineInstr &SaveExecMI = *First;
- if (SaveExecMI.getOpcode() != OrSaveExecOpc)
+ if (SaveExecMI.getOpcode() != LMC->OrSaveExecOpc)
return false;
auto I = llvm::find_if(MBB.terminators(), [this](const MachineInstr &MI) {
- return MI.getOpcode() == XorTermrOpc;
+ return MI.getOpcode() == LMC->XorTermOpc;
});
if (I == MBB.terminators().end())
return false;
@@ -314,7 +312,7 @@ bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
MachineInstr *AndExecMI = nullptr;
I--;
while (I != First && !AndExecMI) {
- if (I->getOpcode() == AndOpc && I->getOperand(0).getReg() == DstReg &&
+ if (I->getOpcode() == LMC->AndOpc && I->getOperand(0).getReg() == DstReg &&
I->getOperand(1).getReg() == Register(ExecReg))
AndExecMI = &*I;
I--;
@@ -371,14 +369,9 @@ bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) {
TII = ST.getInstrInfo();
MRI = &MF.getRegInfo();
- const bool Wave32 = ST.isWave32();
- AndOpc = Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- Andn2Opc = Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
- OrSaveExecOpc =
- Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
- XorTermrOpc = Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
- CondReg = MCRegister::from(Wave32 ? AMDGPU::VCC_LO : AMDGPU::VCC);
- ExecReg = MCRegister::from(Wave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+ LMC = &AMDGPU::getLaneMaskConstants(&ST);
+ CondReg = MCRegister::from(LMC->VccReg);
+ ExecReg = MCRegister::from(LMC->ExecReg);
DenseSet<Register> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
bool Changed = false;
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 1198bbc310daa..8dfc116c92847 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -69,6 +69,7 @@
#include "SIWholeQuadMode.h"
#include "AMDGPU.h"
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
@@ -155,7 +156,7 @@ class SIWholeQuadMode {
MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
: ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT),
- PDT(PDT) {}
+ PDT(PDT), LMC(AMDGPU::getLaneMaskConstants(ST)) {}
bool run(MachineFunction &MF);
private:
@@ -166,15 +167,8 @@ class SIWholeQuadMode {
LiveIntervals *LIS;
MachineDominatorTree *MDT;
MachinePostDominatorTree *PDT;
+ const AMDGPU::LaneMaskConstants &LMC;
- unsigned AndOpc;
- unsigned AndTermOpc;
- unsigned AndN2Opc;
- unsigned XorOpc;
- unsigned AndSaveExecOpc;
- unsigned AndSaveExecTermOpc;
- unsigned WQMOpc;
- Register Exec;
Register LiveMaskReg;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
@@ -882,14 +876,12 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
const MachineOperand &Op1 = MI.getOperand(1);
// VCC represents lanes killed.
- Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
-
if (TRI->isVGPR(*MRI, Op0.getReg())) {
Opcode = AMDGPU::getVOPe32(Opcode);
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
} else {
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .addReg(VCC, RegState::Define)
+ .addReg(LMC.VccReg, RegState::Define)
.addImm(0) // src0 modifiers
.add(Op1)
.addImm(0) // src1 modifiers
@@ -898,9 +890,9 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
}
MachineInstr *MaskUpdateMI =
- BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
- .addReg(VCC);
+ .addReg(LMC.VccReg);
// State of SCC represents whether any lanes are live in mask,
// if SCC is 0 then no lanes will be alive anymore.
@@ -908,7 +900,9 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
MachineInstr *ExecMaskMI =
- BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
+ BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
+ .addReg(LMC.VccReg);
assert(MBB.succ_size() == 1);
@@ -942,9 +936,9 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
if (Op.isImm()) {
if (Op.getImm() == KillVal) {
// Static: all active lanes are killed
- MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
- .addReg(Exec);
+ .addReg(LMC.ExecReg);
} else {
// Static: kill does nothing
bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
@@ -964,14 +958,15 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
// Op represents live lanes after kill,
// so exec mask needs to be factored in.
TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
- ComputeKilledMaskMI =
- BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op);
- MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ ComputeKilledMaskMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), TmpReg)
+ .addReg(LMC.ExecReg)
+ .add(Op);
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
.addReg(TmpReg);
} else {
// Op represents lanes to kill
- MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(LMC.AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
.add(Op);
}
@@ -990,24 +985,25 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
if (IsDemote) {
// Demote - deactivate quads with only helper lanes
LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
- WQMMaskMI =
- BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
- NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
- .addReg(Exec)
+ WQMMaskMI = BuildMI(MBB, MI, DL, TII->get(LMC.WQMOpc), LiveMaskWQM)
+ .addReg(LiveMaskReg);
+ NewTerm = BuildMI(MBB, MI, DL, TII->get(LMC.AndOpc), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
.addReg(LiveMaskWQM);
} else {
// Kill - deactivate lanes no longer in live mask
if (Op.isImm()) {
- unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
+ NewTerm =
+ BuildMI(MBB, &MI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(0);
} else if (!IsWQM) {
- NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
- .addReg(Exec)
+ NewTerm = BuildMI(MBB, &MI, DL, TII->get(LMC.AndOpc), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
.addReg(LiveMaskReg);
} else {
- unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
- NewTerm =
- BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
+ unsigned Opcode = KillVal ? LMC.AndN2Opc : LMC.AndOpc;
+ NewTerm = BuildMI(MBB, &MI, DL, TII->get(Opcode), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
+ .add(Op);
}
}
@@ -1183,13 +1179,14 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineInstr *MI;
if (SaveWQM) {
- unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
+ unsigned Opcode =
+ IsTerminator ? LMC.AndSaveExecTermOpc : LMC.AndSaveExecOpc;
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
.addReg(LiveMaskReg);
} else {
- unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
- .addReg(Exec)
+ unsigned Opcode = IsTerminator ? LMC.AndTermOpc : LMC.AndOpc;
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), LMC.ExecReg)
+ .addReg(LMC.ExecReg)
.addReg(LiveMaskReg);
}
@@ -1203,10 +1200,11 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineInstr *MI;
if (SavedWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), LMC.ExecReg)
.addReg(SavedWQM);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg)
+ .addReg(LMC.ExecReg);
}
LIS->InsertMachineInstrInMaps(*MI);
@@ -1246,11 +1244,11 @@ void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
if (CurrentStrictState == StateStrictWWM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
- Exec)
+ LMC.ExecReg)
.addReg(SavedOrig);
} else {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
- Exec)
+ LMC.ExecReg)
.addReg(SavedOrig);
}
LIS->InsertMachineInstrInMaps(*MI);
@@ -1280,7 +1278,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
if (IsEntry) {
// Skip the instruction that saves LiveMask
if (II != IE && II->getOpcode() == AMDGPU::COPY &&
- II->getOperand(1).getReg() == TRI->getExec())
+ II->getOperand(1).getReg() == LMC.ExecReg)
++II;
}
@@ -1565,18 +1563,14 @@ bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
- bool IsWave32 = ST->isWave32();
if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
assert(MBB == &MBB->getParent()->front() &&
"init whole wave not in entry block");
Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
- MachineInstr *SaveExec =
- BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
- TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
- : AMDGPU::S_OR_SAVEEXEC_B64),
- EntryExec)
- .addImm(-1);
+ MachineInstr *SaveExec = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(LMC.OrSaveExecOpc), EntryExec)
+ .addImm(-1);
// Replace all uses of MI's destination reg with EntryExec.
MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
@@ -1596,11 +1590,9 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
// This should be before all vector instructions.
- MachineInstr *InitMI =
- BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
- TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
- Exec)
- .addImm(MI.getOperand(0).getImm());
+ MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ TII->get(LMC.MovOpc), LMC.ExecReg)
+ .addImm(MI.getOperand(0).getImm());
if (LIS) {
LIS->RemoveMachineInstrFromMaps(MI);
LIS->InsertMachineInstrInMaps(*InitMI);
@@ -1644,19 +1636,14 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
.addReg(InputReg)
.addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
- auto BfmMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
- .addReg(CountReg)
- .addImm(0);
+ auto BfmMI = BuildMI(*MBB, FirstMI, DL, TII->get(LMC.BfmOpc), LMC.ExecReg)
+ .addReg(CountReg)
+ .addImm(0);
auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(CountReg, RegState::Kill)
.addImm(WavefrontSize);
auto CmovMI =
- BuildMI(*MBB, FirstMI, DL,
- TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
- Exec)
- .addImm(-1);
+ BuildMI(*MBB, FirstMI, DL, TII->get(LMC.CMovOpc), LMC.ExecReg).addImm(-1);
if (!LIS) {
MI.eraseFromParent();
@@ -1711,30 +1698,10 @@ bool SIWholeQuadMode::run(MachineFunction &MF) {
SetInactiveInstrs.clear();
StateTransition.clear();
- if (ST->isWave32()) {
- AndOpc = AMDGPU::S_AND_B32;
- AndTermOpc = AMDGPU::S_AND_B32_term;
- AndN2Opc = AMDGPU::S_ANDN2_B32;
- XorOpc = AMDGPU::S_XOR_B32;
- AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
- AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
- WQMOpc = AMDGPU::S_WQM_B32;
- Exec = AMDGPU::EXEC_LO;
- } else {
- AndOpc = AMDGPU::S_AND_B64;
- AndTermOpc = AMDGPU::S_AND_B64_term;
- AndN2Opc = AMDGPU::S_ANDN2_B64;
- XorOpc = AMDGPU::S_XOR_B64;
- AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
- AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
- WQMOpc = AMDGPU::S_WQM_B64;
- Exec = AMDGPU::EXEC;
- }
-
const char GlobalFlags = analyzeFunction(MF);
bool Changed = false;
- LiveMaskReg = Exec;
+ LiveMaskReg = LMC.ExecReg;
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
@@ -1748,7 +1715,7 @@ bool SIWholeQuadMode::run(MachineFunction &MF) {
LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
MachineInstr *MI =
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(Exec);
+ .addReg(LMC.ExecReg);
LIS->InsertMachineInstrInMaps(*MI);
Changed = true;
}
@@ -1779,8 +1746,9 @@ bool SIWholeQuadMode::run(MachineFunction &MF) {
Changed |= lowerKillInstrs(false);
} else if (GlobalFlags == StateWQM) {
// Shader only needs WQM
- auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
- .addReg(Exec);
+ auto MI =
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg)
+ .addReg(LMC.ExecReg);
LIS->InsertMachineInstrInMaps(*MI);
lowerKillInstrs(true);
Changed = true;
@@ -1798,7 +1766,7 @@ bool SIWholeQuadMode::run(MachineFunction &MF) {
}
// Compute live range for live mask
- if (LiveMaskReg != Exec)
+ if (LiveMaskReg != LMC.ExecReg)
LIS->createAndComputeVirtRegInterval(LiveMaskReg);
// Physical registers like SCC aren't tracked by default anyway, so just
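For illustration only (not part of the patch): a minimal standalone C++ sketch of the pattern this change introduces, namely selecting a wave-size-dependent constants table once per pass and reusing it at every call site instead of repeating `isWave32() ? ..._B32 : ..._B64` ternaries. All names and the placeholder string values below are hypothetical; the real helper is AMDGPU::getLaneMaskConstants(ST), which returns the AMDGPU::LaneMaskConstants struct of MC opcodes shown in the diff.

// Standalone sketch, not LLVM code; opcodes are placeholder strings rather
// than real MC opcode enums.
#include <cstdio>

struct LaneMaskConstantsSketch {
  const char *ExecReg;
  const char *AndOpc;
  const char *AndN2Opc;
  const char *MovOpc;
  const char *WQMOpc;
};

// Two static instances, one per wave size, returned by reference so a pass
// can cache `const LaneMaskConstantsSketch &LMC = ...` exactly once.
static const LaneMaskConstantsSketch &
getLaneMaskConstantsSketch(bool IsWave32) {
  static const LaneMaskConstantsSketch Wave32 = {
      "EXEC_LO", "S_AND_B32", "S_ANDN2_B32", "S_MOV_B32", "S_WQM_B32"};
  static const LaneMaskConstantsSketch Wave64 = {
      "EXEC", "S_AND_B64", "S_ANDN2_B64", "S_MOV_B64", "S_WQM_B64"};
  return IsWave32 ? Wave32 : Wave64;
}

int main() {
  for (bool IsWave32 : {true, false}) {
    const LaneMaskConstantsSketch &LMC = getLaneMaskConstantsSketch(IsWave32);
    // A pass would now emit, e.g., BuildMI(..., TII->get(LMC.AndN2Opc),
    // LMC.ExecReg) rather than re-deriving the opcode at each call site.
    std::printf("wave%d: exec=%s andn2=%s wqm=%s\n", IsWave32 ? 32 : 64,
                LMC.ExecReg, LMC.AndN2Opc, LMC.WQMOpc);
  }
  return 0;
}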