[llvm] [AMDGPU][SIPreEmitPeephole] pre-commit tests: mustRetainExeczBranch: use a cost model (PR #109816)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 24 08:22:59 PDT 2024
https://github.com/jmmartinez created https://github.com/llvm/llvm-project/pull/109816
None
>From bd9a3b0f6e5eb2ea6bc1d9f72d7aef0b679703b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 23 Sep 2024 16:18:54 +0200
Subject: [PATCH 1/3] [AMDGPU][StructurizeCFG] Pre-Commit tests: Maintain
branch MD_prof metadata
---
.../AMDGPU/structurizer-keep-perf-md.ll | 72 +++++++++++++++++++
1 file changed, 72 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll
diff --git a/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll b/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll
new file mode 100644
index 00000000000000..408678b98cc1da
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -S -structurizecfg %s | FileCheck -check-prefix=OPT %s
+
+define amdgpu_ps i32 @if_else(i32 %0) {
+; OPT-LABEL: define amdgpu_ps i32 @if_else(
+; OPT-SAME: i32 [[TMP0:%.*]]) {
+; OPT-NEXT: [[C:%.*]] = icmp ne i32 [[TMP0]], 0
+; OPT-NEXT: br i1 [[C]], label %[[FALSE:.*]], label %[[FLOW:.*]]
+; OPT: [[FLOW]]:
+; OPT-NEXT: [[TMP2:%.*]] = phi i32 [ 33, %[[FALSE]] ], [ undef, [[TMP1:%.*]] ]
+; OPT-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[FALSE]] ], [ true, [[TMP1]] ]
+; OPT-NEXT: br i1 [[TMP3]], label %[[TRUE:.*]], label %[[EXIT:.*]]
+; OPT: [[TRUE]]:
+; OPT-NEXT: br label %[[EXIT]]
+; OPT: [[FALSE]]:
+; OPT-NEXT: br label %[[FLOW]]
+; OPT: [[EXIT]]:
+; OPT-NEXT: [[RET:%.*]] = phi i32 [ [[TMP2]], %[[FLOW]] ], [ 42, %[[TRUE]] ]
+; OPT-NEXT: ret i32 [[RET]]
+;
+ %c = icmp eq i32 %0, 0
+ br i1 %c, label %true, label %false, !prof !0
+
+true: ; preds = %1
+ br label %exit
+
+false: ; preds = %1
+ br label %exit
+
+exit: ; preds = %false, %true
+ %ret = phi i32 [ 42, %true ], [ 33, %false ]
+ ret i32 %ret
+}
+
+define amdgpu_ps void @loop_if_break(i32 %n) {
+; OPT-LABEL: define amdgpu_ps void @loop_if_break(
+; OPT-SAME: i32 [[N:%.*]]) {
+; OPT-NEXT: [[ENTRY:.*]]:
+; OPT-NEXT: br label %[[LOOP:.*]]
+; OPT: [[LOOP]]:
+; OPT-NEXT: [[I:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[TMP0:%.*]], %[[FLOW:.*]] ]
+; OPT-NEXT: [[C:%.*]] = icmp ugt i32 [[I]], 0
+; OPT-NEXT: br i1 [[C]], label %[[LOOP_BODY:.*]], label %[[FLOW]]
+; OPT: [[LOOP_BODY]]:
+; OPT-NEXT: [[I_NEXT:%.*]] = sub i32 [[I]], 1
+; OPT-NEXT: br label %[[FLOW]]
+; OPT: [[FLOW]]:
+; OPT-NEXT: [[TMP0]] = phi i32 [ [[I_NEXT]], %[[LOOP_BODY]] ], [ undef, %[[LOOP]] ]
+; OPT-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[LOOP_BODY]] ], [ true, %[[LOOP]] ]
+; OPT-NEXT: br i1 [[TMP1]], label %[[EXIT:.*]], label %[[LOOP]]
+; OPT: [[EXIT]]:
+; OPT-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop_body, %entry
+ %i = phi i32 [ %n, %entry ], [ %i.next, %loop_body ]
+ %c = icmp ugt i32 %i, 0
+ br i1 %c, label %loop_body, label %exit, !prof !0
+
+loop_body: ; preds = %loop
+ %i.next = sub i32 %i, 1
+ br label %loop
+
+exit: ; preds = %loop
+ ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!0 = !{!"branch_weights", i32 1000, i32 1}
>From e903c5a5d399bcce2629d18dd5077a0fc5bfec01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Mon, 23 Sep 2024 16:38:37 +0200
Subject: [PATCH 2/3] [AMDGPU][StructurizeCFG] Maintain branch MD_prof metadata
---
llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 87 +++++++++++++++----
.../AMDGPU/structurizer-keep-perf-md.ll | 8 +-
2 files changed, 75 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index aca8225cebb3fd..563ce402fd44fe 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -30,6 +30,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
@@ -47,6 +48,7 @@
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
+#include <optional>
#include <utility>
using namespace llvm;
@@ -85,7 +87,46 @@ using PhiMap = MapVector<PHINode *, BBValueVector>;
using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
-using BBPredicates = DenseMap<BasicBlock *, Value *>;
+
+using MaybeCondBranchWeights = std::optional<class CondBranchWeights>;
+
+class CondBranchWeights {
+ uint32_t TrueWeight;
+ uint32_t FalseWeight;
+
+public:
+ CondBranchWeights(unsigned T, unsigned F) : TrueWeight(T), FalseWeight(F) {}
+
+ static MaybeCondBranchWeights tryParse(const BranchInst &Br) {
+ assert(Br.isConditional());
+
+ SmallVector<uint32_t, 2> Weights;
+ if (!extractBranchWeights(Br, Weights))
+ return std::nullopt;
+
+ if (Weights.size() != 2)
+ return std::nullopt;
+
+ return CondBranchWeights{Weights[0], Weights[1]};
+ }
+
+ static void setMetadata(BranchInst &Br,
+ MaybeCondBranchWeights const &Weights) {
+ assert(Br.isConditional());
+ if (!Weights)
+ return;
+ uint32_t Arr[] = {Weights->TrueWeight, Weights->FalseWeight};
+ setBranchWeights(Br, Arr, false);
+ }
+
+ CondBranchWeights invert() const {
+ return CondBranchWeights{FalseWeight, TrueWeight};
+ }
+};
+
+using ValueWeightPair = std::pair<Value *, MaybeCondBranchWeights>;
+
+using BBPredicates = DenseMap<BasicBlock *, ValueWeightPair>;
using PredMap = DenseMap<BasicBlock *, BBPredicates>;
using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
@@ -271,7 +312,7 @@ class StructurizeCFG {
void analyzeLoops(RegionNode *N);
- Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+ ValueWeightPair buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
void gatherPredicates(RegionNode *N);
@@ -449,16 +490,22 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
}
/// Build the condition for one edge
-Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
- bool Invert) {
+ValueWeightPair StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+ bool Invert) {
Value *Cond = Invert ? BoolFalse : BoolTrue;
+ MaybeCondBranchWeights Weights = std::nullopt;
+
if (Term->isConditional()) {
Cond = Term->getCondition();
+ Weights = CondBranchWeights::tryParse(*Term);
- if (Idx != (unsigned)Invert)
+ if (Idx != (unsigned)Invert) {
Cond = invertCondition(Cond);
+ if (Weights)
+ Weights = Weights->invert();
+ }
}
- return Cond;
+ return {Cond, Weights};
}
/// Analyze the predecessors of each block and build up predicates
@@ -490,8 +537,8 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
if (Visited.count(Other) && !Loops.count(Other) &&
!Pred.count(Other) && !Pred.count(P)) {
- Pred[Other] = BoolFalse;
- Pred[P] = BoolTrue;
+ Pred[Other] = {BoolFalse, std::nullopt};
+ Pred[P] = {BoolTrue, std::nullopt};
continue;
}
}
@@ -512,9 +559,9 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
BasicBlock *Entry = R->getEntry();
if (Visited.count(Entry))
- Pred[Entry] = BoolTrue;
+ Pred[Entry] = {BoolTrue, std::nullopt};
else
- LPred[Entry] = BoolFalse;
+ LPred[Entry] = {BoolFalse, std::nullopt};
}
}
}
@@ -578,12 +625,14 @@ void StructurizeCFG::insertConditions(bool Loops) {
Dominator.addBlock(Parent);
Value *ParentValue = nullptr;
- for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
+ MaybeCondBranchWeights ParentWeights = std::nullopt;
+ for (std::pair<BasicBlock *, ValueWeightPair> BBAndPred : Preds) {
BasicBlock *BB = BBAndPred.first;
- Value *Pred = BBAndPred.second;
+ Value *Pred = BBAndPred.second.first;
if (BB == Parent) {
ParentValue = Pred;
+ ParentWeights = BBAndPred.second.second;
break;
}
PhiInserter.AddAvailableValue(BB, Pred);
@@ -592,6 +641,7 @@ void StructurizeCFG::insertConditions(bool Loops) {
if (ParentValue) {
Term->setCondition(ParentValue);
+ CondBranchWeights::setMetadata(*Term, ParentWeights);
} else {
if (!Dominator.resultIsRememberedBlock())
PhiInserter.AddAvailableValue(Dominator.result(), Default);
@@ -607,7 +657,7 @@ void StructurizeCFG::simplifyConditions() {
for (auto &I : concat<PredMap::value_type>(Predicates, LoopPreds)) {
auto &Preds = I.second;
for (auto &J : Preds) {
- auto &Cond = J.second;
+ auto &Cond = J.second.first;
Instruction *Inverted;
if (match(Cond, m_Not(m_OneUse(m_Instruction(Inverted)))) &&
!Cond->use_empty()) {
@@ -904,9 +954,10 @@ void StructurizeCFG::setPrevNode(BasicBlock *BB) {
/// Does BB dominate all the predicates of Node?
bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
BBPredicates &Preds = Predicates[Node->getEntry()];
- return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
- return DT->dominates(BB, Pred.first);
- });
+ return llvm::all_of(Preds,
+ [&](std::pair<BasicBlock *, ValueWeightPair> Pred) {
+ return DT->dominates(BB, Pred.first);
+ });
}
/// Can we predict that this node will always be called?
@@ -918,9 +969,9 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
if (!PrevNode)
return true;
- for (std::pair<BasicBlock*, Value*> Pred : Preds) {
+ for (std::pair<BasicBlock *, ValueWeightPair> Pred : Preds) {
BasicBlock *BB = Pred.first;
- Value *V = Pred.second;
+ Value *V = Pred.second.first;
if (V != BoolTrue)
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll b/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll
index 408678b98cc1da..d036d6cbca7b9d 100644
--- a/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll
+++ b/llvm/test/CodeGen/AMDGPU/structurizer-keep-perf-md.ll
@@ -5,7 +5,7 @@ define amdgpu_ps i32 @if_else(i32 %0) {
; OPT-LABEL: define amdgpu_ps i32 @if_else(
; OPT-SAME: i32 [[TMP0:%.*]]) {
; OPT-NEXT: [[C:%.*]] = icmp ne i32 [[TMP0]], 0
-; OPT-NEXT: br i1 [[C]], label %[[FALSE:.*]], label %[[FLOW:.*]]
+; OPT-NEXT: br i1 [[C]], label %[[FALSE:.*]], label %[[FLOW:.*]], !prof [[PROF0:![0-9]+]]
; OPT: [[FLOW]]:
; OPT-NEXT: [[TMP2:%.*]] = phi i32 [ 33, %[[FALSE]] ], [ undef, [[TMP1:%.*]] ]
; OPT-NEXT: [[TMP3:%.*]] = phi i1 [ false, %[[FALSE]] ], [ true, [[TMP1]] ]
@@ -40,7 +40,7 @@ define amdgpu_ps void @loop_if_break(i32 %n) {
; OPT: [[LOOP]]:
; OPT-NEXT: [[I:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[TMP0:%.*]], %[[FLOW:.*]] ]
; OPT-NEXT: [[C:%.*]] = icmp ugt i32 [[I]], 0
-; OPT-NEXT: br i1 [[C]], label %[[LOOP_BODY:.*]], label %[[FLOW]]
+; OPT-NEXT: br i1 [[C]], label %[[LOOP_BODY:.*]], label %[[FLOW]], !prof [[PROF1:![0-9]+]]
; OPT: [[LOOP_BODY]]:
; OPT-NEXT: [[I_NEXT:%.*]] = sub i32 [[I]], 1
; OPT-NEXT: br label %[[FLOW]]
@@ -70,3 +70,7 @@ exit: ; preds = %loop
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
!0 = !{!"branch_weights", i32 1000, i32 1}
+;.
+; OPT: [[PROF0]] = !{!"branch_weights", i32 1, i32 1000}
+; OPT: [[PROF1]] = !{!"branch_weights", i32 1000, i32 1}
+;.
>From 3422babb9ea423107052d348ea1bfb8f2bf61cad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Wed, 28 Aug 2024 17:30:54 +0200
Subject: [PATCH 3/3] [AMDGPU][SIPreEmitPeephole] pre-commit tests:
mustRetainExeczBranch: use a cost model
---
.../AMDGPU/amdgpu-demote-scc-branches.ll | 265 ++++++++++++++++++
1 file changed, 265 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
new file mode 100644
index 00000000000000..33865c04b3fe92
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX1010 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX1030 %s
+
+define void @convergent_cmp_no_metadata(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX10-LABEL: convergent_cmp_no_metadata:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lt_i32 s21, 1
+; GFX10-NEXT: s_cbranch_scc1 .LBB0_2
+; GFX10-NEXT: ; %bb.1: ; %if.then
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_mov_b32_e32 v1, s19
+; GFX10-NEXT: s_mov_b32 s11, s18
+; GFX10-NEXT: s_mov_b32 s10, s17
+; GFX10-NEXT: s_mov_b32 s9, s16
+; GFX10-NEXT: s_mov_b32 s8, s7
+; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-NEXT: .LBB0_2: ; %if.end
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %cmp = icmp sgt i32 %flag, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @convergent_cmp_unprofitable(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX10-LABEL: convergent_cmp_unprofitable:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lt_i32 s21, 1
+; GFX10-NEXT: s_cbranch_scc1 .LBB1_2
+; GFX10-NEXT: ; %bb.1: ; %if.then
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_mov_b32_e32 v1, s19
+; GFX10-NEXT: s_mov_b32 s11, s18
+; GFX10-NEXT: s_mov_b32 s10, s17
+; GFX10-NEXT: s_mov_b32 s9, s16
+; GFX10-NEXT: s_mov_b32 s8, s7
+; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-NEXT: .LBB1_2: ; %if.end
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %cmp = icmp sgt i32 %flag, 0
+ br i1 %cmp, label %if.then, label %if.end, !prof !0
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @convergent_cmp_profitable(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX10-LABEL: convergent_cmp_profitable:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_cmp_lt_i32 s21, 1
+; GFX10-NEXT: s_cbranch_scc1 .LBB2_2
+; GFX10-NEXT: ; %bb.1: ; %if.then
+; GFX10-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-NEXT: v_mov_b32_e32 v1, s19
+; GFX10-NEXT: s_mov_b32 s11, s18
+; GFX10-NEXT: s_mov_b32 s10, s17
+; GFX10-NEXT: s_mov_b32 s9, s16
+; GFX10-NEXT: s_mov_b32 s8, s7
+; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-NEXT: .LBB2_2: ; %if.end
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %cmp = icmp sgt i32 %flag, 0
+ br i1 %cmp, label %if.then, label %if.end, !prof !1
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @divergent_cmp_no_metadata(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX1010-LABEL: divergent_cmp_no_metadata:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1010-NEXT: v_cmp_gt_i32_e32 vcc_lo, s21, v0
+; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1010-NEXT: s_cbranch_execz .LBB3_2
+; GFX1010-NEXT: ; %bb.1: ; %if.then
+; GFX1010-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-NEXT: v_mov_b32_e32 v1, s19
+; GFX1010-NEXT: s_mov_b32 s11, s18
+; GFX1010-NEXT: s_mov_b32 s10, s17
+; GFX1010-NEXT: s_mov_b32 s9, s16
+; GFX1010-NEXT: s_mov_b32 s8, s7
+; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1010-NEXT: .LBB3_2: ; %if.end
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: divergent_cmp_no_metadata:
+; GFX1030: ; %bb.0: ; %entry
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1030-NEXT: s_mov_b32 s4, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 s21, v0
+; GFX1030-NEXT: s_cbranch_execz .LBB3_2
+; GFX1030-NEXT: ; %bb.1: ; %if.then
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s19
+; GFX1030-NEXT: s_mov_b32 s11, s18
+; GFX1030-NEXT: s_mov_b32 s10, s17
+; GFX1030-NEXT: s_mov_b32 s9, s16
+; GFX1030-NEXT: s_mov_b32 s8, s7
+; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1030-NEXT: .LBB3_2: ; %if.end
+; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp = icmp sgt i32 %flag, %id
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @divergent_cmp_unprofitable(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX1010-LABEL: divergent_cmp_unprofitable:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1010-NEXT: v_cmp_gt_i32_e32 vcc_lo, s21, v0
+; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1010-NEXT: s_cbranch_execz .LBB4_2
+; GFX1010-NEXT: ; %bb.1: ; %if.then
+; GFX1010-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-NEXT: v_mov_b32_e32 v1, s19
+; GFX1010-NEXT: s_mov_b32 s11, s18
+; GFX1010-NEXT: s_mov_b32 s10, s17
+; GFX1010-NEXT: s_mov_b32 s9, s16
+; GFX1010-NEXT: s_mov_b32 s8, s7
+; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1010-NEXT: .LBB4_2: ; %if.end
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: divergent_cmp_unprofitable:
+; GFX1030: ; %bb.0: ; %entry
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1030-NEXT: s_mov_b32 s4, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 s21, v0
+; GFX1030-NEXT: s_cbranch_execz .LBB4_2
+; GFX1030-NEXT: ; %bb.1: ; %if.then
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s19
+; GFX1030-NEXT: s_mov_b32 s11, s18
+; GFX1030-NEXT: s_mov_b32 s10, s17
+; GFX1030-NEXT: s_mov_b32 s9, s16
+; GFX1030-NEXT: s_mov_b32 s8, s7
+; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1030-NEXT: .LBB4_2: ; %if.end
+; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp = icmp sgt i32 %flag, %id
+ br i1 %cmp, label %if.then, label %if.end, !prof !0
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+define void @divergent_cmp_profitable(i32 noundef inreg %value, ptr addrspace(8) nocapture writeonly inreg %res, i32 noundef inreg %v_offset, i32 noundef inreg %0, i32 noundef inreg %flag) {
+; GFX1010-LABEL: divergent_cmp_profitable:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1010-NEXT: v_cmp_gt_i32_e32 vcc_lo, s21, v0
+; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1010-NEXT: s_cbranch_execz .LBB5_2
+; GFX1010-NEXT: ; %bb.1: ; %if.then
+; GFX1010-NEXT: v_mov_b32_e32 v0, s6
+; GFX1010-NEXT: v_mov_b32_e32 v1, s19
+; GFX1010-NEXT: s_mov_b32 s11, s18
+; GFX1010-NEXT: s_mov_b32 s10, s17
+; GFX1010-NEXT: s_mov_b32 s9, s16
+; GFX1010-NEXT: s_mov_b32 s8, s7
+; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1010-NEXT: .LBB5_2: ; %if.end
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: divergent_cmp_profitable:
+; GFX1030: ; %bb.0: ; %entry
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; GFX1030-NEXT: s_mov_b32 s4, exec_lo
+; GFX1030-NEXT: v_cmpx_gt_i32_e64 s21, v0
+; GFX1030-NEXT: s_cbranch_execz .LBB5_2
+; GFX1030-NEXT: ; %bb.1: ; %if.then
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s19
+; GFX1030-NEXT: s_mov_b32 s11, s18
+; GFX1030-NEXT: s_mov_b32 s10, s17
+; GFX1030-NEXT: s_mov_b32 s9, s16
+; GFX1030-NEXT: s_mov_b32 s8, s7
+; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX1030-NEXT: .LBB5_2: ; %if.end
+; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp = icmp sgt i32 %flag, %id
+ br i1 %cmp, label %if.then, label %if.end, !prof !1
+
+if.then:
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %value, ptr addrspace(8) %res, i32 %v_offset, i32 0, i32 0)
+ br label %if.end
+
+if.end:
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg)
+declare void @llvm.amdgcn.s.waitcnt(i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+!0 = !{!"branch_weights", i32 1000, i32 1000}
+!1 = !{!"branch_weights", i32 2000, i32 1}
More information about the llvm-commits
mailing list