[llvm] 3c42a58 - [AMDGPU] Add Lower Bound to PipelineSolver
Jeff Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 5 14:55:20 PDT 2023
Author: Jeff Byrnes
Date: 2023-04-05T14:54:59-07:00
New Revision: 3c42a58c4f20ae3b621733bf5ee6d57c912994a9
URL: https://github.com/llvm/llvm-project/commit/3c42a58c4f20ae3b621733bf5ee6d57c912994a9
DIFF: https://github.com/llvm/llvm-project/commit/3c42a58c4f20ae3b621733bf5ee6d57c912994a9.diff
LOG: [AMDGPU] Add Lower Bound to PipelineSolver
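The exact solver in the PipelineSolver is a branch-and-bound search that assigns conflicted SUs to SchedGroups. With this change, solve() computes a static lower bound on the achievable pipeline cost up front: the exact solver is only invoked when the greedy cost exceeds that bound, and the search now terminates as soon as BestCost reaches the bound rather than only when the cost hits zero. In addition, behind the new hidden flag amdgpu-igrouplp-exact-solver-lower-bound (off by default), solveExact() recomputes a lower bound for the still-unassigned SUs at each branch and backtracks whenever the current cost plus that bound can no longer beat the incumbent solution.

A minimal sketch of the pruning idea follows; it uses hypothetical types and names, not the solver's actual data structures, and simply assigns each "item" (an SU stand-in) to one "group" (a SchedGroup stand-in):

// Hypothetical illustration of branch-and-bound with lower-bound pruning.
#include <algorithm>
#include <climits>
#include <cstdio>
#include <vector>

// Cost[I][G] is the cost of putting item I into group G.
using CostTable = std::vector<std::vector<int>>;
static int BestCost = INT_MAX; // best complete assignment found so far

// Idealized bound: every remaining item takes its cheapest group,
// ignoring any interaction between assignments.
static int lowerBound(const CostTable &Cost, int From) {
  int LB = 0;
  for (int I = From, E = (int)Cost.size(); I < E; ++I)
    LB += *std::min_element(Cost[I].begin(), Cost[I].end());
  return LB;
}

static void solveExact(const CostTable &Cost, int Item, int CurrCost) {
  if (Item == (int)Cost.size()) {
    BestCost = std::min(BestCost, CurrCost);
    return;
  }
  // Prune: even an idealized completion cannot beat the incumbent.
  if (CurrCost + lowerBound(Cost, Item) >= BestCost)
    return;
  for (int C : Cost[Item])
    solveExact(Cost, Item + 1, CurrCost + C);
}

int main() {
  CostTable Cost = {{0, 2}, {1, 3}, {2, 0}};
  solveExact(Cost, 0, 0);
  std::printf("best cost: %d\n", BestCost); // prints 1
}

Because the bound never overestimates the true cost of completing the partial assignment, discarding a branch on this test can never discard the optimum; that is the property the real calculateLowerBound() relies on as well.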
Added:
llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index fc0df61952e48..35dddf85596d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -61,6 +61,15 @@ static cl::opt<bool> UseCostHeur(
"Experimentally, results are mixed, so this should be set on a "
"case-by-case basis."));
+static cl::opt<bool> EnableLowerBound(
+ "amdgpu-igrouplp-exact-solver-lower-bound", cl::Hidden,
+ cl::desc("Whether to use a lower bound when calculating the cost "
+ "for a partial fit using the exact solver. The lower bound "
+ "calculates the cost of assigning the remaining instructions "
+ "under idealized conditions. The LB reduces the overall search "
+ "space but adds time complexity per branch explored."),
+ cl::init(false));
+
// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask {
@@ -109,7 +118,11 @@ class SchedGroup {
const SIInstrInfo *TII;
- // Try to add and edge from SU A to SU B.
+ // Try to add an edge from SU A to SU B. This returns false if there is a
+ // dependency which makes adding the A->B edge impossible; otherwise it
+ // returns true. Note that it returns true even if no new edge was added:
+ // for example, if there is already an edge between A and B, this returns
+ // true even though DAG->addEdge does not add one.
bool tryAddEdge(SUnit *A, SUnit *B);
// Use SGMask to determine whether we can classify MI as a member of this
@@ -131,7 +144,7 @@ class SchedGroup {
// Add DAG dependencies and track which edges are added, and the count of
// missed edges
int link(SUnit &SU, bool MakePred,
- std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
+ SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges);
// Add DAG dependencies from all SUnits in this SchedGroup and this SU.
// Use the predicate to determine whether SU should be a predecessor (P =
@@ -243,6 +256,9 @@ class PipelineSolver {
int BestCost = -1;
int CurrCost = 0;
+ // A lower bound on the optimal cost for a complete pipeline
+ int StaticLowerBound = 0;
+
// Index pointing to the conflicting instruction that is currently being
// fitted
int CurrConflInstNo = 0;
@@ -270,14 +286,19 @@ class PipelineSolver {
void populateReadyList(SUToCandSGsPair &CurrSU,
SmallVectorImpl<std::pair<int, int>> &ReadyList,
SmallVectorImpl<SchedGroup> &SyncPipeline);
+ // Calculate the cost of the best possible assignment for an unassigned SU
+ // without actually assigning it. The sum of these costs across unassigned
+ // SUs is a lower bound on the true best cost for the set of unassigned SUs.
+ int calculateLowerBound();
// Add edges corresponding to the SchedGroups as assigned by solver
void makePipeline();
// Add the edges from the SU to the other SchedGroups in pipeline, and
// return the number of edges missed.
int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
- std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
+ SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges,
+ int BestCost = -1);
// Remove the edges passed via AddedEdges
- void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
+ void removeEdges(SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges);
// Convert the passed in maps to arrays for bidirectional iterators
void convertSyncMapsToArrays();
@@ -395,7 +416,7 @@ void PipelineSolver::makePipeline() {
int PipelineSolver::addEdges(
SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
- std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
+ SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges, int BestCost) {
int AddedCost = 0;
bool MakePred = false;
@@ -406,6 +427,8 @@ int PipelineSolver::addEdges(
// linked as a predecessor of the subsequent SchedGroups
auto GroupNo = (int)SyncPipeline.size() - 1;
for (; GroupNo >= 0; GroupNo--) {
+ if (BestCost != -1 && AddedCost >= BestCost)
+ return AddedCost;
if (SyncPipeline[GroupNo].getSGID() == SGID) {
MakePred = true;
continue;
@@ -419,15 +442,18 @@ int PipelineSolver::addEdges(
}
void PipelineSolver::removeEdges(
- const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
+ SmallVectorImpl<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
// Only remove the edges that we have added when testing
// the fit.
for (auto &PredSuccPair : EdgesToRemove) {
SUnit *Pred = PredSuccPair.first;
SUnit *Succ = PredSuccPair.second;
- auto Match = llvm::find_if(
- Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
+ auto Match =
+ std::find_if(Succ->Preds.begin(), Succ->Preds.end(), [&Pred](SDep &P) {
+ return P.getSUnit() == Pred && P.isArtificial();
+ });
+
if (Match != Succ->Preds.end()) {
assert(Match->isArtificial());
Succ->removePred(*Match);
@@ -478,7 +504,7 @@ bool PipelineSolver::checkOptimal() {
if (BestCost == -1 || CurrCost < BestCost) {
BestPipeline = CurrPipeline;
BestCost = CurrCost;
- LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
+ LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << '\n');
}
assert(BestCost >= 0);
}
@@ -487,7 +513,7 @@ bool PipelineSolver::checkOptimal() {
if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
DoneExploring = true;
- return (DoneExploring || BestCost == 0);
+ return (DoneExploring || BestCost == StaticLowerBound);
}
void PipelineSolver::populateReadyList(
@@ -496,8 +522,9 @@ void PipelineSolver::populateReadyList(
assert(CurrSU.second.size() >= 1);
auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();
+ SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
for (; I != E; ++I) {
- std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+
int CandSGID = *I;
SchedGroup *Match;
for (auto &SG : SyncPipeline) {
@@ -510,6 +537,7 @@ void PipelineSolver::populateReadyList(
ReadyList.push_back(std::pair(*I, MissPenalty));
continue;
}
+ AddedEdges.clear();
int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
ReadyList.push_back(std::pair(*I, TempCost));
@@ -528,6 +556,52 @@ void PipelineSolver::populateReadyList(
assert(ReadyList.size() == CurrSU.second.size());
}
+int PipelineSolver::calculateLowerBound() {
+ if (CurrSyncGroupIdx >= (int)CurrPipeline.size())
+ return 0;
+ int TempConflInstNo = CurrConflInstNo;
+ int TmpSyncGroupIdx = CurrSyncGroupIdx;
+ int MinimumCost = 0;
+ SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
+
+ for (; TmpSyncGroupIdx < (int)CurrPipeline.size(); TmpSyncGroupIdx++) {
+ auto SyncPipeline = CurrPipeline[TmpSyncGroupIdx];
+ for (; TempConflInstNo < (int)PipelineInstrs[TmpSyncGroupIdx].size();
+ TempConflInstNo++) {
+ auto CurrSU = PipelineInstrs[TmpSyncGroupIdx][TempConflInstNo];
+ auto I = CurrSU.second.rbegin();
+ auto E = CurrSU.second.rend();
+ int MinCostForSU = -1;
+ for (; I != E; I++) {
+ int CandSGID = *I;
+ SchedGroup *Match;
+ for (auto &SG : SyncPipeline) {
+ if (SG.getSGID() == CandSGID)
+ Match = &SG;
+ }
+
+ if (Match->isFull()) {
+ if (MinCostForSU == -1 || MissPenalty < MinCostForSU)
+ MinCostForSU = MissPenalty;
+ continue;
+ }
+ AddedEdges.clear();
+ int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID,
+ AddedEdges, MinCostForSU);
+ if (MinCostForSU == -1 || TempCost < MinCostForSU)
+ MinCostForSU = TempCost;
+
+ removeEdges(AddedEdges);
+ if (MinCostForSU == 0)
+ break;
+ }
+ MinimumCost += MinCostForSU;
+ }
+ TempConflInstNo = 0;
+ }
+ return MinimumCost;
+}
+
bool PipelineSolver::solveExact() {
if (checkOptimal())
return true;
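The bound assembled just above is the sum, over every SU that is not yet assigned, of the cheapest cost any of its candidate SchedGroups could charge it, with MissPenalty standing in for groups that are already full:

  LB = sum over unassigned SUs of ( min over candidate SGs of cost(SU, SG) )

As a made-up illustration: if three remaining SUs have cheapest per-SU costs of 0, 1 and 2, no completion of the current partial pipeline can cost less than CurrCost + 3, so once an incumbent BestCost is known, solveExact() may skip any branch where CurrCost + 3 is not strictly below BestCost.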
@@ -540,12 +614,13 @@ bool PipelineSolver::solveExact() {
PipelineInstrs[CurrSyncGroupIdx].size());
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
- << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
+ << ") in Pipeline # " << CurrSyncGroupIdx << '\n');
// SchedGroup -> Cost pairs
SmallVector<std::pair<int, int>, 4> ReadyList;
// Prioritize the candidate sched groups in terms of lowest cost first
populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
+ SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
auto I = ReadyList.begin();
auto E = ReadyList.end();
@@ -558,7 +633,6 @@ bool PipelineSolver::solveExact() {
int CandSGID = I->first;
int AddedCost = 0;
- std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
SchedGroup *Match;
for (auto &SG : SyncPipeline) {
@@ -571,19 +645,22 @@ bool PipelineSolver::solveExact() {
LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " << CandSGID
- << "\n");
+ << '\n');
Match->add(*CurrSU.first);
+ AddedEdges.clear();
AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
- LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
+ LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << '\n');
CurrCost += AddedCost;
advancePosition();
++BranchesExplored;
bool FinishedExploring = false;
// If the Cost after adding edges is greater than a known solution,
// backtrack
- if (CurrCost < BestCost || BestCost == -1) {
+ int LBCost =
+ (EnableLowerBound && BestCost != -1) ? calculateLowerBound() : 0;
+ if (BestCost == -1 || CurrCost + LBCost < BestCost) {
if (solveExact()) {
- FinishedExploring = BestCost != 0;
+ FinishedExploring = BestCost != StaticLowerBound;
if (!FinishedExploring)
return true;
}
@@ -609,7 +686,7 @@ bool PipelineSolver::solveExact() {
bool FinishedExploring = false;
if (CurrCost < BestCost || BestCost == -1) {
if (solveExact()) {
- bool FinishedExploring = BestCost != 0;
+ bool FinishedExploring = BestCost != StaticLowerBound;
if (!FinishedExploring)
return true;
}
@@ -622,7 +699,7 @@ bool PipelineSolver::solveExact() {
bool PipelineSolver::solveGreedy() {
BestCost = 0;
- std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+ SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
@@ -632,7 +709,7 @@ bool PipelineSolver::solveGreedy() {
int BestGroupID = -1;
auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
- << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
+ << ") in Pipeline # " << CurrSyncGroupIdx << '\n');
// Since we have added the potential SchedGroups from bottom up, but
// traversed the DAG from top down, parse over the groups from last to
@@ -641,7 +718,7 @@ bool PipelineSolver::solveGreedy() {
auto I = CurrSU.second.rbegin();
auto E = CurrSU.second.rend();
for (; I != E; ++I) {
- std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+ SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
int CandSGID = *I;
SchedGroup *Match;
for (auto &SG : SyncPipeline) {
@@ -650,14 +727,15 @@ bool PipelineSolver::solveGreedy() {
}
LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
- << (int)Match->getMask() << "\n");
+ << (int)Match->getMask() << '\n');
if (Match->isFull()) {
LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
continue;
}
- TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
- LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
+ TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges,
+ BestNodeCost);
+ LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << '\n');
if (TempCost < BestNodeCost || BestNodeCost == -1) {
BestGroup = Match;
BestNodeCost = TempCost;
@@ -672,7 +750,7 @@ bool PipelineSolver::solveGreedy() {
BestGroup->add(*CurrSU.first);
addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
- << (int)BestGroup->getMask() << "\n");
+ << (int)BestGroup->getMask() << '\n');
BestCost += TempCost;
} else
BestCost += MissPenalty;
@@ -709,11 +787,14 @@ void PipelineSolver::solve() {
LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
solveGreedy();
reset();
- LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
- if (BestCost > 0) {
+ LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << '\n');
+ StaticLowerBound = calculateLowerBound();
+ LLVM_DEBUG(dbgs() << "Lower Bound on Pipeline Cost is " << StaticLowerBound
+ << '\n');
+ if (BestCost > StaticLowerBound) {
LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
solveExact();
- LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
+ LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << '\n');
}
} else { // Use the Greedy Algorithm by default
LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
@@ -897,7 +978,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
}
int SchedGroup::link(SUnit &SU, bool MakePred,
- std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
+ SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges) {
int MissedEdges = 0;
for (auto *A : Collection) {
SUnit *B = &SU;
@@ -906,10 +987,6 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
if (MakePred)
std::swap(A, B);
- if (DAG->IsReachable(B, A))
- continue;
- // tryAddEdge returns false if there is a dependency that makes adding
- // the A->B edge impossible, otherwise it returns true;
bool Added = tryAddEdge(A, B);
if (Added)
AddedEdges.push_back(std::pair(A, B));
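The per-branch pruning stays off unless the new hidden option is enabled. It is a plain cl::opt<bool>, so it should be usable from llc alongside the other hidden igrouplp options that appear in the RUN lines of the test below; an illustrative invocation (an assumption, not copied from the test) might look like:

llc -march=amdgcn -mcpu=gfx90a -misched-cluster=0 \
    -amdgpu-igrouplp-exact-solver=1 \
    -amdgpu-igrouplp-exact-solver-lower-bound \
    -amdgpu-igrouplp-exact-solver-max-branches=200000 < input.ll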
diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll b/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll
new file mode 100644
index 0000000000000..bcdf2abc4ddc0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACT %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=200000 -amdgpu-igrouplp-exact-solver-cost-heur=1 < %s | FileCheck -check-prefix=LB %s
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
+; EXACT-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
+; EXACT: ; %bb.0:
+; EXACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; EXACT-NEXT: v_lshlrev_b32_e32 v16, 7, v0
+; EXACT-NEXT: ; kill: killed $sgpr0_sgpr1
+; EXACT-NEXT: s_waitcnt lgkmcnt(0)
+; EXACT-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: s_waitcnt vmcnt(1)
+; EXACT-NEXT: v_mul_lo_u32 v13, v13, v13
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
+; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
+; EXACT-NEXT: v_mul_lo_u32 v12, v12, v12
+; EXACT-NEXT: v_mul_lo_u32 v15, v15, v15
+; EXACT-NEXT: v_mul_lo_u32 v14, v14, v14
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
+; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
+; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
+; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
+; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
+; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
+; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
+; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
+; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
+; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
+; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
+; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
+; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
+; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
+; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
+; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
+; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
+; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
+; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
+; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
+; EXACT-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
+; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9
+; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8
+; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11
+; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10
+; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
+; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11
+; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10
+; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9
+; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8
+; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: s_endpgm
+;
+; LB-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
+; LB: ; %bb.0:
+; LB-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; LB-NEXT: v_lshlrev_b32_e32 v12, 7, v0
+; LB-NEXT: s_waitcnt lgkmcnt(0)
+; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:64
+; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; LB-NEXT: s_waitcnt vmcnt(0)
+; LB-NEXT: v_mul_lo_u32 v11, v11, v11
+; LB-NEXT: v_mul_lo_u32 v10, v10, v10
+; LB-NEXT: v_mul_lo_u32 v9, v9, v9
+; LB-NEXT: v_mul_lo_u32 v8, v8, v8
+; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:64
+; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1]
+; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; LB-NEXT: s_waitcnt vmcnt(0)
+; LB-NEXT: v_mul_lo_u32 v3, v3, v3
+; LB-NEXT: v_mul_lo_u32 v2, v2, v2
+; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:32
+; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; LB-NEXT: s_waitcnt vmcnt(0)
+; LB-NEXT: v_mul_lo_u32 v9, v9, v9
+; LB-NEXT: v_mul_lo_u32 v8, v8, v8
+; LB-NEXT: v_mul_lo_u32 v11, v11, v11
+; LB-NEXT: v_mul_lo_u32 v10, v10, v10
+; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:32
+; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:112
+; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; LB-NEXT: s_waitcnt vmcnt(0)
+; LB-NEXT: v_mul_lo_u32 v7, v7, v7
+; LB-NEXT: v_mul_lo_u32 v6, v6, v6
+; LB-NEXT: v_mul_lo_u32 v1, v1, v1
+; LB-NEXT: v_mul_lo_u32 v0, v0, v0
+; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3]
+; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:96
+; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; LB-NEXT: s_waitcnt vmcnt(0)
+; LB-NEXT: v_mul_lo_u32 v3, v3, v3
+; LB-NEXT: v_mul_lo_u32 v2, v2, v2
+; LB-NEXT: v_mul_lo_u32 v1, v1, v1
+; LB-NEXT: v_mul_lo_u32 v0, v0, v0
+; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:96
+; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:80
+; LB-NEXT: s_waitcnt vmcnt(0)
+; LB-NEXT: v_mul_lo_u32 v3, v3, v3
+; LB-NEXT: v_mul_lo_u32 v2, v2, v2
+; LB-NEXT: v_mul_lo_u32 v1, v1, v1
+; LB-NEXT: v_mul_lo_u32 v0, v0, v0
+; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:80
+; LB-NEXT: v_mul_lo_u32 v5, v5, v5
+; LB-NEXT: v_mul_lo_u32 v4, v4, v4
+; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:112
+; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:48
+; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; LB-NEXT: s_waitcnt vmcnt(0)
+; LB-NEXT: v_mul_lo_u32 v5, v5, v5
+; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:16
+; LB-NEXT: v_mul_lo_u32 v4, v4, v4
+; LB-NEXT: s_waitcnt vmcnt(0)
+; LB-NEXT: v_mul_lo_u32 v1, v1, v1
+; LB-NEXT: v_mul_lo_u32 v0, v0, v0
+; LB-NEXT: v_mul_lo_u32 v3, v3, v3
+; LB-NEXT: v_mul_lo_u32 v2, v2, v2
+; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:16
+; LB-NEXT: v_mul_lo_u32 v7, v7, v7
+; LB-NEXT: v_mul_lo_u32 v6, v6, v6
+; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:48
+; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; LB-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid
+ %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1
+ %mul = mul <32 x i32> %load, %load
+ %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid
+ store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2
+ ; 1 VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
+ ; 2 VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
+ ; 1 VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
+ ; 1 VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
+ ; 2 VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
+ ; 1 VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
+ ; 1 VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
+ ; 2 VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
+ ; 1 VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
+ ; 1 VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
+ ; 2 VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
+ ; 1 VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
+ ; 1 VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
+ ; 2 VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
+ ; 1 VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
+ ; 1 VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
+ ; 2 VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
+ ; 1 VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
+ ; 1 VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
+ ; 2 VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
+ ; 1 VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
+ ; 1 VMEM read
+ call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0)
+ ; 2 VALU
+ call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0)
+ ; 1 VMEM write
+ call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #0
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #0
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" readnone speculatable}