[llvm] [AMDGPU] Add option to bias SGPR allocation to reduce read hazards (PR #129869)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 5 17:41:15 PST 2025
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/129869
>From b293fd31eb14fe3bddb011387427b0f908173c04 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 5 Dec 2024 16:10:35 +0900
Subject: [PATCH 1/4] [AMDGPU] Add option to bias SGPR allocation to reduce
read hazards
- Scan for potential hazards in virtual registers before SGPR allocation.
- Use this data to build a new allocation order via allocation hints.
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 3 +
.../AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp | 102 ++++++++
.../Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h | 25 ++
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIDefines.h | 1 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 170 ++++++++++++-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 7 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 +
.../CodeGen/AMDGPU/sgpr-hazard-realloc.ll | 229 ++++++++++++++++++
10 files changed, 544 insertions(+), 1 deletion(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 57297288eecb4..ac3d683ffeea2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -478,6 +478,9 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+void initializeAMDGPUMarkSGPRHazardRegsLegacyPass(PassRegistry &);
+extern char &AMDGPUMarkSGPRHazardRegsLegacyID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
new file mode 100644
index 0000000000000..46dfcbb48e54f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.cpp
@@ -0,0 +1,102 @@
+//===- AMDGPUMarkSGPRHazardRegs.cpp - Annotate SGPRs used by VALU ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to mark SGPRs used by VALU.
+/// Marks can be used during register allocation to reduce hazards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMarkSGPRHazardRegs.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-sgpr-hazard-regs"
+
+namespace {
+
+class AMDGPUMarkSGPRHazardRegs {
+public:
+ AMDGPUMarkSGPRHazardRegs() {}
+ bool run(MachineFunction &MF);
+};
+
+class AMDGPUMarkSGPRHazardRegsLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUMarkSGPRHazardRegsLegacy() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ return AMDGPUMarkSGPRHazardRegs().run(MF);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+bool AMDGPUMarkSGPRHazardRegs::run(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasVALUReadSGPRHazard())
+ return false;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (!TRI->getSGPRHazardAvoidanceStrategy(MF))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "AMDGPUMarkSGPRHazardRegs: function " << MF.getName()
+ << "\n");
+
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ const auto *RC = MRI->getRegClass(Reg);
+ if (!RC || !TRI->isSGPRClass(RC))
+ continue;
+ for (const auto &MO : MRI->reg_nodbg_operands(Reg)) {
+ const MachineInstr &MI = *MO.getParent();
+ if (SIInstrInfo::isVALU(MI) && MO.isUse()) {
+ FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+ break;
+ }
+ }
+ }
+
+ return true;
+}
+
+INITIALIZE_PASS(AMDGPUMarkSGPRHazardRegsLegacy, DEBUG_TYPE,
+ "AMDGPU Mark Hazard SGPRs", false, false)
+
+char AMDGPUMarkSGPRHazardRegsLegacy::ID = 0;
+
+char &llvm::AMDGPUMarkSGPRHazardRegsLegacyID =
+ AMDGPUMarkSGPRHazardRegsLegacy::ID;
+
+PreservedAnalyses
+AMDGPUMarkSGPRHazardRegsPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ AMDGPUMarkSGPRHazardRegs().run(MF);
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
new file mode 100644
index 0000000000000..89905ceb1185d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkSGPRHazardRegs.h
@@ -0,0 +1,25 @@
+//===--- AMDGPUMarkSGPRHazardRegs.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class AMDGPUMarkSGPRHazardRegsPass
+ : public PassInfoMixin<AMDGPUMarkSGPRHazardRegsPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMARKSGPRHAZARDSREGS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce3dcd920bce3..889006b1086c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
+#include "AMDGPUMarkSGPRHazardRegs.h"
#include "AMDGPUOpenCLEnqueuedBlockLowering.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
@@ -560,6 +561,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeGCNRegPressurePrinterPass(*PR);
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
+ initializeAMDGPUMarkSGPRHazardRegsLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1613,6 +1615,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
addPass(&GCNPreRALongBranchRegID);
+ addPass(&AMDGPUMarkSGPRHazardRegsLegacyID);
addPass(createSGPRAllocPass(true));
// Commit allocated register changes. This is mostly necessary because too
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 408da0536237e..83de8ac1c7535 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -85,6 +85,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
+ AMDGPUMarkSGPRHazardRegs.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
AMDGPUPreLegalizerCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 721601efcc804..bf91118ec148a 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1066,6 +1066,7 @@ namespace VirtRegFlag {
enum Register_Flag : uint8_t {
// Register operand in a whole-wave mode operation.
WWM_REG = 1 << 0,
+ SGPR_HAZARD_REG = 1 << 1
};
} // namespace VirtRegFlag
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 128cd8244a477..d210d701b9306 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -11,14 +11,15 @@
//
//===----------------------------------------------------------------------===//
+#include "SIRegisterInfo.h"
#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35,6 +36,10 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<unsigned> SGPRHazardAvoidanceStrategy(
+ "amdgpu-sgpr-hazard-regalloc", cl::init(0), cl::ReallyHidden,
+ cl::desc("Register allocation strategy to reduce SGPR read hazards"));
+
std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
@@ -3904,3 +3909,166 @@ SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
RegFlags.push_back("WWM_REG");
return RegFlags;
}
+
+unsigned SIRegisterInfo::getSGPRHazardAvoidanceStrategy(
+ const MachineFunction &MF) const {
+ if (SGPRHazardAvoidanceStrategy.getNumOccurrences()) {
+ return SGPRHazardAvoidanceStrategy;
+ } else {
+ return MF.getFunction().getFnAttributeAsParsedInteger(
+ "amdgpu-sgpr-hazard-regalloc", 0);
+ }
+}
+
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+ VirtReg, Order, Hints, MF, VRM, Matrix);
+ if (!VRM)
+ return BaseImplRetVal;
+
+ // Only use hinting to reduce SGPR read hazards when required.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasVALUReadSGPRHazard())
+ return BaseImplRetVal;
+
+ // Only treat SGPRs
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const auto *RC = MRI->getRegClass(VirtReg);
+ if (!isSGPRClass(RC))
+ return BaseImplRetVal;
+
+ const unsigned Strategy = getSGPRHazardAvoidanceStrategy(MF);
+ if (!Strategy)
+ return BaseImplRetVal;
+
+ SmallSet<MCPhysReg, 4> CopyHints;
+ CopyHints.insert(Hints.begin(), Hints.end());
+
+ auto AddHint = [&](MCPhysReg PhysReg) {
+ if (CopyHints.contains(PhysReg) || MRI->isReserved(PhysReg))
+ return;
+ Hints.push_back(PhysReg);
+ };
+ auto AddHints = [&](ArrayRef<MCPhysReg> Regs) {
+ for (MCPhysReg PhysReg : Regs)
+ AddHint(PhysReg);
+ };
+
+ // V1: simply reverse allocation order, mean 23% reduction in hazards
+ if (Strategy == 1) {
+ if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ for (MCPhysReg PhysReg : reverse(Order))
+ AddHint(PhysReg);
+ } else {
+ for (MCPhysReg PhysReg : Order)
+ AddHint(PhysReg);
+ }
+ return true;
+ }
+
+ // Build set of current hazard pairs from live matrix
+ auto *LiveUnions = const_cast<LiveRegMatrix *>(Matrix)->getLiveUnions();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+ DenseMap<MCPhysReg, unsigned> IntervalCount;
+ std::bitset<64> HazardPairs;
+
+ for (MCPhysReg PhysReg : Order) {
+ SmallSet<const LiveInterval *, 4> Intervals;
+ bool IsHazard = false;
+ for (auto Unit : TRI->regunits(PhysReg)) {
+ LiveIntervalUnion &LIU = LiveUnions[Unit];
+ for (const LiveInterval *LI : LIU.getMap()) {
+ Intervals.insert(LI);
+ if (FuncInfo->checkFlag(LI->reg(),
+ AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ IsHazard = true;
+ // Break here as we only care about interval count for non-hazard regs
+ break;
+ }
+ }
+ if (IsHazard)
+ break;
+ }
+ if (IsHazard) {
+ unsigned PairN = TRI->getEncodingValue(PhysReg) >> 1;
+ if (PairN <= 63)
+ HazardPairs.set(PairN);
+ }
+ IntervalCount[PhysReg] = Intervals.size();
+ }
+
+ // V2: weight the entire order based on hazard free usage, mean 30% reduction
+ // in hazards
+ if (Strategy == 2) {
+ bool VRegIsHazard =
+ FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG);
+ SmallVector<MCPhysReg> NewOrder(Order);
+ std::sort(NewOrder.begin(), NewOrder.end(), [&](MCPhysReg A, MCPhysReg B) {
+ return VRegIsHazard ? IntervalCount[A] < IntervalCount[B]
+ : IntervalCount[B] < IntervalCount[A];
+ });
+ AddHints(NewOrder);
+ return true;
+ }
+
+ // V3: complex partitioning, mean 35% reduction in hazards
+ assert(Strategy == 3);
+
+ // Partition the allocation order based on hazards
+ SmallVector<MCPhysReg> Unallocated, UnallocatedWithHazard;
+ SmallVector<MCPhysReg> Allocated, AllocatedWithHazard;
+
+ for (MCPhysReg PhysReg : Order) {
+ Register VReg = Matrix->getOneVReg(PhysReg);
+ bool HasHazard = false;
+ // XXX: can remove regunit scan for just SGPR32/SGPR64
+ for (auto Unit : TRI->regunits(PhysReg)) {
+ unsigned PairN = TRI->getEncodingValue(Unit) >> 1;
+ if (PairN <= 63 && HazardPairs[PairN]) {
+ HasHazard = true;
+ break;
+ }
+ }
+ if (VReg == MCRegister::NoRegister) {
+ if (HasHazard)
+ UnallocatedWithHazard.push_back(PhysReg);
+ else
+ Unallocated.push_back(PhysReg);
+ } else {
+ if (HasHazard)
+ AllocatedWithHazard.push_back(PhysReg);
+ else
+ Allocated.push_back(PhysReg);
+ }
+ }
+
+ if (FuncInfo->checkFlag(VirtReg, AMDGPU::VirtRegFlag::SGPR_HAZARD_REG)) {
+ // Reorder allocations based on usage, so least used will be reused first.
+ // This means least used regs are touched by hazards first.
+ std::sort(Allocated.begin(), Allocated.end(),
+ [&](MCPhysReg A, MCPhysReg B) {
+ return IntervalCount[A] < IntervalCount[B];
+ });
+ // Reverse order of allocations to try to keep hazards away - yes it helps.
+ std::reverse(Unallocated.begin(), Unallocated.end());
+
+ AddHints(AllocatedWithHazard);
+ AddHints(UnallocatedWithHazard);
+ AddHints(Unallocated);
+ AddHints(Allocated);
+ } else {
+ AddHints(Allocated);
+ AddHints(Unallocated);
+ AddHints(UnallocatedWithHazard);
+ AddHints(AllocatedWithHazard);
+ }
+
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a64180daea2ad..5ccede9ca3529 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -465,6 +465,13 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
SmallVector<StringLiteral>
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+ unsigned getSGPRHazardAvoidanceStrategy(const MachineFunction &MF) const;
+
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
};
namespace AMDGPU {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d7f54f3b8e9e2..b933286b8c6ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -357,6 +357,7 @@
; GCN-O1-NEXT: SI Whole Quad Mode
; GCN-O1-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Machine Block Frequency Analysis
; GCN-O1-NEXT: Debug Variable Analysis
@@ -670,6 +671,7 @@
; GCN-O1-OPTS-NEXT: SI Whole Quad Mode
; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O1-OPTS-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Debug Variable Analysis
@@ -989,6 +991,7 @@
; GCN-O2-NEXT: SI optimize exec mask operations pre-RA
; GCN-O2-NEXT: SI Form memory clauses
; GCN-O2-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O2-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Machine Block Frequency Analysis
; GCN-O2-NEXT: Debug Variable Analysis
@@ -1321,6 +1324,7 @@
; GCN-O3-NEXT: SI optimize exec mask operations pre-RA
; GCN-O3-NEXT: SI Form memory clauses
; GCN-O3-NEXT: AMDGPU Pre-RA Long Branch Reg
+; GCN-O3-NEXT: AMDGPU Mark Hazard SGPRs
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Machine Block Frequency Analysis
; GCN-O3-NEXT: Debug Variable Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
new file mode 100644
index 0000000000000..91b06588aa2cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=0 < %s | FileCheck -check-prefix DEF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=1 < %s | FileCheck -check-prefix V1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=2 < %s | FileCheck -check-prefix V2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=3 < %s | FileCheck -check-prefix V3 %s
+
+define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float %d, ptr addrspace(1) %out) {
+; DEF-LABEL: fadd_f32:
+; DEF: ; %bb.0: ; %entry
+; DEF-NEXT: s_add_f32 s3, s0, s1
+; DEF-NEXT: s_sub_f32 s1, s0, s1
+; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DEF-NEXT: v_dual_add_f32 v0, s3, v0 :: v_dual_add_f32 v1, s1, v1
+; DEF-NEXT: v_readfirstlane_b32 s0, v0
+; DEF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; DEF-NEXT: v_readfirstlane_b32 s4, v1
+; DEF-NEXT: v_mul_f32_e32 v4, v0, v1
+; DEF-NEXT: s_and_b32 s0, s0, s4
+; DEF-NEXT: global_store_b32 v[2:3], v4, off
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_cmp_lg_u32 s0, 0
+; DEF-NEXT: s_mov_b32 s0, 0
+; DEF-NEXT: s_cbranch_scc0 .LBB0_5
+; DEF-NEXT: ; %bb.1: ; %false
+; DEF-NEXT: s_mov_b32 s2, exec_lo
+; DEF-NEXT: v_add_f32_e32 v0, v0, v1
+; DEF-NEXT: s_buffer_load_b32 s4, s[0:3], 0x0
+; DEF-NEXT: s_and_b32 s1, s3, s1
+; DEF-NEXT: s_wait_kmcnt 0x0
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_lshl_b32 s1, s4, s1
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: v_cmp_ne_u32_e32 vcc_lo, s1, v1
+; DEF-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_and_not1_b32 s2, s2, s1
+; DEF-NEXT: s_cbranch_scc0 .LBB0_6
+; DEF-NEXT: ; %bb.2: ; %false
+; DEF-NEXT: s_wait_alu 0xfffe
+; DEF-NEXT: s_and_b32 exec_lo, exec_lo, s2
+; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DEF-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; DEF-NEXT: s_cbranch_vccnz .LBB0_4
+; DEF-NEXT: .LBB0_3: ; %true
+; DEF-NEXT: v_mul_f32_e32 v0, v1, v4
+; DEF-NEXT: .LBB0_4: ; %final
+; DEF-NEXT: s_branch .LBB0_7
+; DEF-NEXT: .LBB0_5:
+; DEF-NEXT: ; implicit-def: $vgpr0
+; DEF-NEXT: s_branch .LBB0_3
+; DEF-NEXT: .LBB0_6:
+; DEF-NEXT: s_mov_b32 exec_lo, 0
+; DEF-NEXT: export mrt0 off, off, off, off done
+; DEF-NEXT: s_endpgm
+; DEF-NEXT: .LBB0_7:
+;
+; V1-LABEL: fadd_f32:
+; V1: ; %bb.0: ; %entry
+; V1-NEXT: s_add_f32 s104, s0, s1
+; V1-NEXT: s_sub_f32 s103, s0, s1
+; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; V1-NEXT: v_dual_add_f32 v0, s104, v0 :: v_dual_add_f32 v1, s103, v1
+; V1-NEXT: v_readfirstlane_b32 s0, v0
+; V1-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; V1-NEXT: v_readfirstlane_b32 s1, v1
+; V1-NEXT: v_mul_f32_e32 v4, v0, v1
+; V1-NEXT: s_and_b32 s0, s0, s1
+; V1-NEXT: global_store_b32 v[2:3], v4, off
+; V1-NEXT: s_cmp_lg_u32 s0, 0
+; V1-NEXT: s_mov_b32 s0, 0
+; V1-NEXT: s_cbranch_scc0 .LBB0_5
+; V1-NEXT: ; %bb.1: ; %false
+; V1-NEXT: s_mov_b32 s2, exec_lo
+; V1-NEXT: v_add_f32_e32 v0, v0, v1
+; V1-NEXT: s_buffer_load_b32 s1, s[0:3], 0x0
+; V1-NEXT: s_and_b32 s3, s104, s103
+; V1-NEXT: s_wait_kmcnt 0x0
+; V1-NEXT: s_lshl_b32 vcc_hi, s1, s3
+; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; V1-NEXT: v_cmp_ne_u32_e32 vcc_lo, vcc_hi, v1
+; V1-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
+; V1-NEXT: s_and_not1_b32 s2, s2, s1
+; V1-NEXT: s_cbranch_scc0 .LBB0_6
+; V1-NEXT: ; %bb.2: ; %false
+; V1-NEXT: s_and_b32 exec_lo, exec_lo, s2
+; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V1-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; V1-NEXT: s_wait_alu 0xfffe
+; V1-NEXT: s_cbranch_vccnz .LBB0_4
+; V1-NEXT: .LBB0_3: ; %true
+; V1-NEXT: v_mul_f32_e32 v0, v1, v4
+; V1-NEXT: .LBB0_4: ; %final
+; V1-NEXT: s_branch .LBB0_7
+; V1-NEXT: .LBB0_5:
+; V1-NEXT: ; implicit-def: $vgpr0
+; V1-NEXT: s_branch .LBB0_3
+; V1-NEXT: .LBB0_6:
+; V1-NEXT: s_mov_b32 exec_lo, 0
+; V1-NEXT: export mrt0 off, off, off, off done
+; V1-NEXT: s_endpgm
+; V1-NEXT: .LBB0_7:
+;
+; V2-LABEL: fadd_f32:
+; V2: ; %bb.0: ; %entry
+; V2-NEXT: s_add_f32 s66, s0, s1
+; V2-NEXT: s_sub_f32 s65, s0, s1
+; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; V2-NEXT: v_dual_add_f32 v0, s66, v0 :: v_dual_add_f32 v1, s65, v1
+; V2-NEXT: v_readfirstlane_b32 s1, v0
+; V2-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; V2-NEXT: v_readfirstlane_b32 vcc_lo, v1
+; V2-NEXT: v_mul_f32_e32 v4, v0, v1
+; V2-NEXT: s_and_b32 s1, s1, vcc_lo
+; V2-NEXT: global_store_b32 v[2:3], v4, off
+; V2-NEXT: s_cmp_lg_u32 s1, 0
+; V2-NEXT: s_mov_b32 s1, 0
+; V2-NEXT: s_cbranch_scc0 .LBB0_5
+; V2-NEXT: ; %bb.1: ; %false
+; V2-NEXT: s_buffer_load_b32 vcc_lo, s[0:3], 0x0
+; V2-NEXT: s_and_b32 s66, s66, s65
+; V2-NEXT: v_add_f32_e32 v0, v0, v1
+; V2-NEXT: s_mov_b32 s79, exec_lo
+; V2-NEXT: s_wait_kmcnt 0x0
+; V2-NEXT: s_wait_alu 0xfffe
+; V2-NEXT: s_lshl_b32 s67, vcc_lo, s66
+; V2-NEXT: s_wait_alu 0xfffe
+; V2-NEXT: v_cmp_ne_u32_e32 vcc_lo, s67, v1
+; V2-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo
+; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V2-NEXT: s_and_not1_b32 s79, s79, vcc_lo
+; V2-NEXT: s_cbranch_scc0 .LBB0_6
+; V2-NEXT: ; %bb.2: ; %false
+; V2-NEXT: s_and_b32 exec_lo, exec_lo, s79
+; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V2-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
+; V2-NEXT: s_cbranch_vccnz .LBB0_4
+; V2-NEXT: .LBB0_3: ; %true
+; V2-NEXT: v_mul_f32_e32 v0, v1, v4
+; V2-NEXT: .LBB0_4: ; %final
+; V2-NEXT: s_branch .LBB0_7
+; V2-NEXT: .LBB0_5:
+; V2-NEXT: ; implicit-def: $vgpr0
+; V2-NEXT: s_branch .LBB0_3
+; V2-NEXT: .LBB0_6:
+; V2-NEXT: s_mov_b32 exec_lo, 0
+; V2-NEXT: export mrt0 off, off, off, off done
+; V2-NEXT: s_endpgm
+; V2-NEXT: .LBB0_7:
+;
+; V3-LABEL: fadd_f32:
+; V3: ; %bb.0: ; %entry
+; V3-NEXT: s_add_f32 s104, s0, s1
+; V3-NEXT: s_sub_f32 s82, s0, s1
+; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; V3-NEXT: v_dual_add_f32 v0, s104, v0 :: v_dual_add_f32 v1, s82, v1
+; V3-NEXT: v_readfirstlane_b32 s0, v0
+; V3-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; V3-NEXT: v_readfirstlane_b32 s1, v1
+; V3-NEXT: v_mul_f32_e32 v4, v0, v1
+; V3-NEXT: s_and_b32 s0, s0, s1
+; V3-NEXT: global_store_b32 v[2:3], v4, off
+; V3-NEXT: s_cmp_lg_u32 s0, 0
+; V3-NEXT: s_mov_b32 s0, 0
+; V3-NEXT: s_cbranch_scc0 .LBB0_5
+; V3-NEXT: ; %bb.1: ; %false
+; V3-NEXT: s_mov_b32 s2, exec_lo
+; V3-NEXT: s_and_b32 s104, s104, s82
+; V3-NEXT: s_buffer_load_b32 s1, s[0:3], 0x0
+; V3-NEXT: v_add_f32_e32 v0, v0, v1
+; V3-NEXT: s_wait_kmcnt 0x0
+; V3-NEXT: s_lshl_b32 s82, s1, s104
+; V3-NEXT: s_wait_alu 0xfffe
+; V3-NEXT: v_cmp_ne_u32_e32 vcc_lo, s82, v1
+; V3-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
+; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V3-NEXT: s_and_not1_b32 s2, s2, s1
+; V3-NEXT: s_cbranch_scc0 .LBB0_6
+; V3-NEXT: ; %bb.2: ; %false
+; V3-NEXT: s_and_b32 exec_lo, exec_lo, s2
+; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; V3-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; V3-NEXT: s_cbranch_vccnz .LBB0_4
+; V3-NEXT: .LBB0_3: ; %true
+; V3-NEXT: v_mul_f32_e32 v0, v1, v4
+; V3-NEXT: .LBB0_4: ; %final
+; V3-NEXT: s_branch .LBB0_7
+; V3-NEXT: .LBB0_5:
+; V3-NEXT: ; implicit-def: $vgpr0
+; V3-NEXT: s_branch .LBB0_3
+; V3-NEXT: .LBB0_6:
+; V3-NEXT: s_mov_b32 exec_lo, 0
+; V3-NEXT: export mrt0 off, off, off, off done
+; V3-NEXT: s_endpgm
+; V3-NEXT: .LBB0_7:
+entry:
+ %s.0 = fadd float %a, %b
+ %s.1 = fsub float %a, %b
+ %v.0 = fadd float %c, %s.0
+ %v.1 = fadd float %d, %s.1
+ %v.2 = fmul float %v.0, %v.1
+ store float %v.2, ptr addrspace(1) %out
+ %tmp.0 = bitcast float %v.0 to i32
+ %tmp.1 = bitcast float %v.1 to i32
+ %tmp.2 = bitcast float %s.0 to i32
+ %tmp.3 = bitcast float %s.1 to i32
+ %s.3 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %tmp.0)
+ %s.4 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %tmp.1)
+ %s.5 = and i32 %s.3, %s.4
+ %s.6 = and i32 %tmp.2, %tmp.3
+ %c.0 = icmp eq i32 %s.5, 0
+ br i1 %c.0, label %true, label %false
+true:
+ %v.3 = fmul float %v.1, %v.2
+ br label %final
+false:
+ %v.4 = fadd float %v.0, %v.1
+ %s.7 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0)
+ %s.8 = shl i32 %s.7, %s.6
+ %c.1 = icmp ne i32 %tmp.1, %s.8
+ call void @llvm.amdgcn.wqm.demote(i1 %c.1)
+ br label %final
+final:
+ %res = phi float [ %v.4, %false ], [ %v.3, %true ]
+ ret float %res
+}
+
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
+declare void @llvm.amdgcn.wqm.demote(i1)
>From 378065404542c3693cb6acd3d273dbde3e79b620 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Wed, 5 Mar 2025 20:32:54 +0900
Subject: [PATCH 2/4] - Remove undef from lit test
---
.../CodeGen/AMDGPU/sgpr-hazard-realloc.ll | 89 +++++++++++--------
1 file changed, 51 insertions(+), 38 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
index 91b06588aa2cc..1b4bf75c6cc01 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-hazard-realloc.ll
@@ -4,40 +4,43 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=2 < %s | FileCheck -check-prefix V2 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-sgpr-hazard-regalloc=3 < %s | FileCheck -check-prefix V3 %s
-define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float %d, ptr addrspace(1) %out) {
+define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float %d, ptr addrspace(1) %out, <4 x i32> inreg %desc) {
; DEF-LABEL: fadd_f32:
; DEF: ; %bb.0: ; %entry
-; DEF-NEXT: s_add_f32 s3, s0, s1
+; DEF-NEXT: s_mov_b32 s6, s4
+; DEF-NEXT: s_mov_b32 s4, s2
+; DEF-NEXT: s_add_f32 s2, s0, s1
; DEF-NEXT: s_sub_f32 s1, s0, s1
-; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; DEF-NEXT: v_dual_add_f32 v0, s3, v0 :: v_dual_add_f32 v1, s1, v1
+; DEF-NEXT: s_mov_b32 s7, s5
+; DEF-NEXT: s_mov_b32 s5, s3
+; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DEF-NEXT: v_dual_add_f32 v0, s2, v0 :: v_dual_add_f32 v1, s1, v1
; DEF-NEXT: v_readfirstlane_b32 s0, v0
; DEF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; DEF-NEXT: v_readfirstlane_b32 s4, v1
+; DEF-NEXT: v_readfirstlane_b32 s3, v1
; DEF-NEXT: v_mul_f32_e32 v4, v0, v1
-; DEF-NEXT: s_and_b32 s0, s0, s4
+; DEF-NEXT: s_and_b32 s0, s0, s3
; DEF-NEXT: global_store_b32 v[2:3], v4, off
; DEF-NEXT: s_wait_alu 0xfffe
; DEF-NEXT: s_cmp_lg_u32 s0, 0
; DEF-NEXT: s_mov_b32 s0, 0
; DEF-NEXT: s_cbranch_scc0 .LBB0_5
; DEF-NEXT: ; %bb.1: ; %false
-; DEF-NEXT: s_mov_b32 s2, exec_lo
+; DEF-NEXT: s_buffer_load_b32 s3, s[4:7], 0x0
+; DEF-NEXT: s_and_b32 s1, s2, s1
; DEF-NEXT: v_add_f32_e32 v0, v0, v1
-; DEF-NEXT: s_buffer_load_b32 s4, s[0:3], 0x0
-; DEF-NEXT: s_and_b32 s1, s3, s1
+; DEF-NEXT: s_mov_b32 s8, exec_lo
; DEF-NEXT: s_wait_kmcnt 0x0
; DEF-NEXT: s_wait_alu 0xfffe
-; DEF-NEXT: s_lshl_b32 s1, s4, s1
+; DEF-NEXT: s_lshl_b32 s1, s3, s1
; DEF-NEXT: s_wait_alu 0xfffe
; DEF-NEXT: v_cmp_ne_u32_e32 vcc_lo, s1, v1
; DEF-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
; DEF-NEXT: s_wait_alu 0xfffe
-; DEF-NEXT: s_and_not1_b32 s2, s2, s1
+; DEF-NEXT: s_and_not1_b32 s8, s8, s1
; DEF-NEXT: s_cbranch_scc0 .LBB0_6
; DEF-NEXT: ; %bb.2: ; %false
-; DEF-NEXT: s_wait_alu 0xfffe
-; DEF-NEXT: s_and_b32 exec_lo, exec_lo, s2
+; DEF-NEXT: s_and_b32 exec_lo, exec_lo, s8
; DEF-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; DEF-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; DEF-NEXT: s_cbranch_vccnz .LBB0_4
@@ -70,19 +73,23 @@ define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float
; V1-NEXT: s_mov_b32 s0, 0
; V1-NEXT: s_cbranch_scc0 .LBB0_5
; V1-NEXT: ; %bb.1: ; %false
-; V1-NEXT: s_mov_b32 s2, exec_lo
+; V1-NEXT: s_mov_b32 s7, s5
+; V1-NEXT: s_mov_b32 s6, s4
+; V1-NEXT: s_mov_b32 s5, s3
+; V1-NEXT: s_mov_b32 s4, s2
+; V1-NEXT: s_and_b32 s2, s104, s103
+; V1-NEXT: s_buffer_load_b32 s1, s[4:7], 0x0
; V1-NEXT: v_add_f32_e32 v0, v0, v1
-; V1-NEXT: s_buffer_load_b32 s1, s[0:3], 0x0
-; V1-NEXT: s_and_b32 s3, s104, s103
+; V1-NEXT: s_mov_b32 s8, exec_lo
; V1-NEXT: s_wait_kmcnt 0x0
-; V1-NEXT: s_lshl_b32 vcc_hi, s1, s3
+; V1-NEXT: s_lshl_b32 vcc_hi, s1, s2
; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; V1-NEXT: v_cmp_ne_u32_e32 vcc_lo, vcc_hi, v1
; V1-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
-; V1-NEXT: s_and_not1_b32 s2, s2, s1
+; V1-NEXT: s_and_not1_b32 s8, s8, s1
; V1-NEXT: s_cbranch_scc0 .LBB0_6
; V1-NEXT: ; %bb.2: ; %false
-; V1-NEXT: s_and_b32 exec_lo, exec_lo, s2
+; V1-NEXT: s_and_b32 exec_lo, exec_lo, s8
; V1-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; V1-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; V1-NEXT: s_wait_alu 0xfffe
@@ -102,10 +109,10 @@ define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float
;
; V2-LABEL: fadd_f32:
; V2: ; %bb.0: ; %entry
-; V2-NEXT: s_add_f32 s66, s0, s1
-; V2-NEXT: s_sub_f32 s65, s0, s1
+; V2-NEXT: s_add_f32 s62, s0, s1
+; V2-NEXT: s_sub_f32 s61, s0, s1
; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; V2-NEXT: v_dual_add_f32 v0, s66, v0 :: v_dual_add_f32 v1, s65, v1
+; V2-NEXT: v_dual_add_f32 v0, s62, v0 :: v_dual_add_f32 v1, s61, v1
; V2-NEXT: v_readfirstlane_b32 s1, v0
; V2-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; V2-NEXT: v_readfirstlane_b32 vcc_lo, v1
@@ -116,21 +123,23 @@ define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float
; V2-NEXT: s_mov_b32 s1, 0
; V2-NEXT: s_cbranch_scc0 .LBB0_5
; V2-NEXT: ; %bb.1: ; %false
-; V2-NEXT: s_buffer_load_b32 vcc_lo, s[0:3], 0x0
-; V2-NEXT: s_and_b32 s66, s66, s65
+; V2-NEXT: s_mov_b32 s55, s5
+; V2-NEXT: s_mov_b32 s54, s4
+; V2-NEXT: s_mov_b32 s53, s3
+; V2-NEXT: s_mov_b32 s52, s2
; V2-NEXT: v_add_f32_e32 v0, v0, v1
-; V2-NEXT: s_mov_b32 s79, exec_lo
+; V2-NEXT: s_buffer_load_b32 vcc_lo, s[52:55], 0x0
+; V2-NEXT: s_and_b32 s54, s62, s61
+; V2-NEXT: s_mov_b32 s69, exec_lo
; V2-NEXT: s_wait_kmcnt 0x0
-; V2-NEXT: s_wait_alu 0xfffe
-; V2-NEXT: s_lshl_b32 s67, vcc_lo, s66
-; V2-NEXT: s_wait_alu 0xfffe
+; V2-NEXT: s_lshl_b32 s67, vcc_lo, s54
+; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; V2-NEXT: v_cmp_ne_u32_e32 vcc_lo, s67, v1
; V2-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo
-; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; V2-NEXT: s_and_not1_b32 s79, s79, vcc_lo
+; V2-NEXT: s_and_not1_b32 s69, s69, vcc_lo
; V2-NEXT: s_cbranch_scc0 .LBB0_6
; V2-NEXT: ; %bb.2: ; %false
-; V2-NEXT: s_and_b32 exec_lo, exec_lo, s79
+; V2-NEXT: s_and_b32 exec_lo, exec_lo, s69
; V2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; V2-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1
; V2-NEXT: s_cbranch_vccnz .LBB0_4
@@ -163,20 +172,24 @@ define amdgpu_ps float @fadd_f32(float inreg %a, float inreg %b, float %c, float
; V3-NEXT: s_mov_b32 s0, 0
; V3-NEXT: s_cbranch_scc0 .LBB0_5
; V3-NEXT: ; %bb.1: ; %false
-; V3-NEXT: s_mov_b32 s2, exec_lo
-; V3-NEXT: s_and_b32 s104, s104, s82
-; V3-NEXT: s_buffer_load_b32 s1, s[0:3], 0x0
+; V3-NEXT: s_mov_b32 s7, s5
+; V3-NEXT: s_mov_b32 s6, s4
+; V3-NEXT: s_mov_b32 s5, s3
+; V3-NEXT: s_mov_b32 s4, s2
; V3-NEXT: v_add_f32_e32 v0, v0, v1
+; V3-NEXT: s_buffer_load_b32 s1, s[4:7], 0x0
+; V3-NEXT: s_and_b32 s4, s104, s82
+; V3-NEXT: s_mov_b32 s8, exec_lo
; V3-NEXT: s_wait_kmcnt 0x0
-; V3-NEXT: s_lshl_b32 s82, s1, s104
+; V3-NEXT: s_lshl_b32 s82, s1, s4
; V3-NEXT: s_wait_alu 0xfffe
; V3-NEXT: v_cmp_ne_u32_e32 vcc_lo, s82, v1
; V3-NEXT: s_and_not1_b32 s1, exec_lo, vcc_lo
; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; V3-NEXT: s_and_not1_b32 s2, s2, s1
+; V3-NEXT: s_and_not1_b32 s8, s8, s1
; V3-NEXT: s_cbranch_scc0 .LBB0_6
; V3-NEXT: ; %bb.2: ; %false
-; V3-NEXT: s_and_b32 exec_lo, exec_lo, s2
+; V3-NEXT: s_and_b32 exec_lo, exec_lo, s8
; V3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; V3-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; V3-NEXT: s_cbranch_vccnz .LBB0_4
@@ -214,7 +227,7 @@ true:
br label %final
false:
%v.4 = fadd float %v.0, %v.1
- %s.7 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0)
+ %s.7 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
%s.8 = shl i32 %s.7, %s.6
%c.1 = icmp ne i32 %tmp.1, %s.8
call void @llvm.amdgcn.wqm.demote(i1 %c.1)
>From c1d427846e6a8872b02088bed7ff0033ba4c9dac Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 6 Mar 2025 10:36:20 +0900
Subject: [PATCH 3/4] - Remove else after return
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d210d701b9306..f96b75690b223 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3912,12 +3912,10 @@ SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
unsigned SIRegisterInfo::getSGPRHazardAvoidanceStrategy(
const MachineFunction &MF) const {
- if (SGPRHazardAvoidanceStrategy.getNumOccurrences()) {
+ if (SGPRHazardAvoidanceStrategy.getNumOccurrences())
return SGPRHazardAvoidanceStrategy;
- } else {
- return MF.getFunction().getFnAttributeAsParsedInteger(
- "amdgpu-sgpr-hazard-regalloc", 0);
- }
+ return MF.getFunction().getFnAttributeAsParsedInteger(
+ "amdgpu-sgpr-hazard-regalloc", 0);
}
bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
>From e8a461eb73b12347b4572ddeb40303ad36a8377b Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 6 Mar 2025 10:40:52 +0900
Subject: [PATCH 4/4] - clang-format
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f96b75690b223..0567a1b4722db 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3915,7 +3915,7 @@ unsigned SIRegisterInfo::getSGPRHazardAvoidanceStrategy(
if (SGPRHazardAvoidanceStrategy.getNumOccurrences())
return SGPRHazardAvoidanceStrategy;
return MF.getFunction().getFnAttributeAsParsedInteger(
- "amdgpu-sgpr-hazard-regalloc", 0);
+ "amdgpu-sgpr-hazard-regalloc", 0);
}
bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
More information about the llvm-commits
mailing list