[llvm] Late temporal divergence lowering for SDAG (PR #67033)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 21 09:06:33 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
<details>
<summary>Changes</summary>
D155343 introduced performance regressions by disallowing machine instruction sinking in some cases - SWDEV-414443.
It originally fixed SWDEV-407790
Revert D155343 to fix performance regression SWDEV-414443
and introduce temporal divergence lowering pass that also fixes SWDEV-407790
---
Patch is 57.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/67033.diff
14 Files Affected:
- (modified) llvm/include/llvm/ADT/GenericUniformityImpl.h (+35)
- (modified) llvm/include/llvm/ADT/GenericUniformityInfo.h (+5)
- (modified) llvm/lib/Analysis/UniformityAnalysis.cpp (+6-2)
- (modified) llvm/lib/CodeGen/MachineSink.cpp (+4-11)
- (modified) llvm/lib/CodeGen/MachineUniformityAnalysis.cpp (+7-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+3)
- (added) llvm/lib/Target/AMDGPU/AMDGPUTemporalDivergenceLowering.cpp (+121)
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+11-2)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir (+1-1)
- (added) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+792)
- (modified) llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir (+1-1)
``````````diff
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index ddd0746ccd91632..ee3fd1b56bc696b 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -341,6 +341,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
using DivergenceDescriptorT =
typename SyncDependenceAnalysisT::DivergenceDescriptor;
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
+ using UseOutsideCycleInfoT =
+ typename std::tuple<ConstValueRefT, const InstructionT *,
+ SmallVector<BlockT *, 4>>;
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
const TargetTransformInfo *TTI)
@@ -396,6 +399,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
void print(raw_ostream &out) const;
+ iterator_range<const UseOutsideCycleInfoT *> uses_outside_cycle() const;
+
protected:
/// \brief Value/block pair representing a single phi input.
struct PhiInput {
@@ -427,6 +432,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
// Recognized cycles with divergent exits.
SmallPtrSet<const CycleT *, 16> DivergentExitCycles;
+ SmallVector<UseOutsideCycleInfoT, 4> UsesOutsideCycle;
// Cycles assumed to be divergent.
//
@@ -470,6 +476,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
/// \brief Whether \p Def is divergent when read in \p ObservingBlock.
bool isTemporalDivergent(const BlockT &ObservingBlock,
const InstructionT &Def) const;
+
+ void recordUseOutsideCycle(ConstValueRefT Src, const InstructionT *UserInstr,
+ const CycleT &DefCycle);
};
template <typename ImplT>
@@ -1210,6 +1219,18 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
}
}
+template <typename ContextT>
+using UseOutsideCycleInfoT =
+ typename std::tuple<typename ContextT::ConstValueRefT,
+ const typename ContextT::InstructionT *,
+ SmallVector<typename ContextT::BlockT *, 4>>;
+
+template <typename ContextT>
+iterator_range<const UseOutsideCycleInfoT<ContextT> *>
+GenericUniformityAnalysisImpl<ContextT>::uses_outside_cycle() const {
+ return make_range(UsesOutsideCycle.begin(), UsesOutsideCycle.end());
+}
+
template <typename ContextT>
bool GenericUniformityInfo<ContextT>::hasDivergence() const {
return DA->hasDivergence();
@@ -1248,6 +1269,12 @@ void GenericUniformityInfo<ContextT>::print(raw_ostream &out) const {
DA->print(out);
}
+template <typename ContextT>
+iterator_range<const UseOutsideCycleInfoT<ContextT> *>
+GenericUniformityInfo<ContextT>::uses_outside_cycle() const {
+ return DA->uses_outside_cycle();
+}
+
template <typename ContextT>
void llvm::ModifiedPostOrder<ContextT>::computeStackPO(
SmallVectorImpl<const BlockT *> &Stack, const CycleInfoT &CI,
@@ -1367,6 +1394,14 @@ void llvm::ModifiedPostOrder<ContextT>::compute(const CycleInfoT &CI) {
computeStackPO(Stack, CI, nullptr, Finalized);
}
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<ContextT>::recordUseOutsideCycle(
+ ConstValueRefT Src, const InstructionT *UserInstr, const CycleT &DefCycle) {
+ SmallVector<BlockT *, 4> TmpExitBlocks;
+ DefCycle.getExitBlocks(TmpExitBlocks);
+ UsesOutsideCycle.push_back({Src, UserInstr, TmpExitBlocks});
+}
+
} // namespace llvm
#undef DEBUG_TYPE
diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h
index e53afccc020b469..dd91f9b34b83004 100644
--- a/llvm/include/llvm/ADT/GenericUniformityInfo.h
+++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h
@@ -39,6 +39,9 @@ template <typename ContextT> class GenericUniformityInfo {
using CycleInfoT = GenericCycleInfo<ContextT>;
using CycleT = typename CycleInfoT::CycleT;
+ using UseOutsideCycleInfoT =
+ typename std::tuple<ConstValueRefT, const InstructionT *,
+ SmallVector<BlockT *, 4>>;
GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
const TargetTransformInfo *TTI = nullptr);
@@ -78,6 +81,8 @@ template <typename ContextT> class GenericUniformityInfo {
void print(raw_ostream &Out) const;
+ iterator_range<const UseOutsideCycleInfoT *> uses_outside_cycle() const;
+
private:
using ImplT = GenericUniformityAnalysisImpl<ContextT>;
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 2d617db431c5888..9679d884a54f5b7 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -80,12 +80,16 @@ template <>
void llvm::GenericUniformityAnalysisImpl<
SSAContext>::propagateTemporalDivergence(const Instruction &I,
const Cycle &DefCycle) {
- if (isDivergent(I))
- return;
for (auto *User : I.users()) {
auto *UserInstr = cast<Instruction>(User);
if (DefCycle.contains(UserInstr->getParent()))
continue;
+
+ recordUseOutsideCycle(cast<Value>(&I), UserInstr, DefCycle);
+
+ if (isDivergent(I))
+ continue;
+
markDivergent(*UserInstr);
}
}
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index b4cbb93d758ef2f..ea056f6c80361b0 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -288,7 +288,8 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
if (!Reg)
continue;
if (MO.isUse()) {
- if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
+ if (Reg.isPhysical() &&
+ (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg))))
continue;
if (PI->modifiesRegister(Reg, TRI))
return true;
@@ -1006,24 +1007,16 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
if (MBB == SuccToSinkTo)
return nullptr;
- if (!SuccToSinkTo)
- return nullptr;
-
// It's not safe to sink instructions to EH landing pad. Control flow into
// landing pad is implicitly defined.
- if (SuccToSinkTo->isEHPad())
+ if (SuccToSinkTo && SuccToSinkTo->isEHPad())
return nullptr;
// It ought to be okay to sink instructions into an INLINEASM_BR target, but
// only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
// the source block (which this code does not yet do). So for now, forbid
// doing so.
- if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
- return nullptr;
-
- MachineBasicBlock::const_iterator InsertPos =
- SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
- if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
+ if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
return nullptr;
return SuccToSinkTo;
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 3e0fe2b1ba087fe..0d277aaf29cf714 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -117,11 +117,16 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
if (!Op.getReg().isVirtual())
continue;
auto Reg = Op.getReg();
- if (isDivergent(Reg))
- continue;
+
for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
if (DefCycle.contains(UserInstr.getParent()))
continue;
+
+ recordUseOutsideCycle(Reg, &UserInstr, DefCycle);
+
+ if (isDivergent(Reg))
+ continue;
+
markDivergent(UserInstr);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b7101f401154706..be1b263bebf8748 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -35,6 +35,7 @@ FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createAMDGPUTemporalDivergenceLoweringPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
@@ -151,6 +152,9 @@ extern char &SILowerWWMCopiesID;
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
+void initializeAMDGPUTemporalDivergenceLoweringPass(PassRegistry &);
+extern char &AMDGPUTemporalDivergenceLoweringID;
+
void initializeSILowerSGPRSpillsPass(PassRegistry &);
extern char &SILowerSGPRSpillsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 481fbaf1543a4ea..a413918ab2c8d09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -358,6 +358,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
+ initializeAMDGPUTemporalDivergenceLoweringPass(*PR);
initializeSILowerWWMCopiesPass(*PR);
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
@@ -1240,6 +1241,8 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
}
void GCNPassConfig::addPreRegAlloc() {
+ addPass(createAMDGPUTemporalDivergenceLoweringPass());
+
if (LateCFGStructurize) {
addPass(createAMDGPUMachineCFGStructurizerPass());
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTemporalDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTemporalDivergenceLowering.cpp
new file mode 100644
index 000000000000000..a53d3b269ece877
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTemporalDivergenceLowering.cpp
@@ -0,0 +1,121 @@
+//===- AMDGPUTemporalDivergenceLowering.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "temporal-divergence-lowering"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUTemporalDivergenceLowering : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ AMDGPUTemporalDivergenceLowering() : MachineFunctionPass(ID) {
+ initializeAMDGPUTemporalDivergenceLoweringPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "Temporal divergence lowering";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineCycleInfoWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUTemporalDivergenceLowering, DEBUG_TYPE,
+ "Temporal divergence lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AMDGPUTemporalDivergenceLowering, DEBUG_TYPE,
+ "Temporal divergence lowering", false, false)
+
+char AMDGPUTemporalDivergenceLowering::ID = 0;
+
+char &llvm::AMDGPUTemporalDivergenceLoweringID =
+ AMDGPUTemporalDivergenceLowering::ID;
+
+FunctionPass *llvm::createAMDGPUTemporalDivergenceLoweringPass() {
+ return new AMDGPUTemporalDivergenceLowering();
+}
+
+static void replaceUseRegisterWith(const MachineInstr *MI, Register Reg,
+ Register Newreg) {
+ for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+ const MachineOperand &Op = MI->getOperand(i);
+ if (Op.isReg() && Op.getReg() == Reg) {
+ const_cast<MachineInstr *>(MI)->getOperand(i).setReg(Newreg);
+ }
+ }
+}
+// Get pointers to build instruction just after MI (skips phis if needed)
+static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
+getInsertAfterPtrs(MachineInstr *MI) {
+ MachineBasicBlock *InsertMBB = MI->getParent();
+ return std::make_pair(
+ InsertMBB, InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
+}
+
+bool AMDGPUTemporalDivergenceLowering::runOnMachineFunction(
+ MachineFunction &MF) {
+
+ MachineCycleInfo &CycleInfo =
+ getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+ MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+
+ MachineUniformityInfo MUI =
+ computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), true);
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+
+ // Temporal divergence lowering is required for uniform UniformSourceReg
+ // and divergent UserInstr. UserInstr is uniform only when loop is uniform.
+ for (auto [SrcReg, UserInstr, CycleExitBlocks] : MUI.uses_outside_cycle()) {
+ if (!MUI.isUniform(SrcReg) || !MUI.isDivergent(UserInstr))
+ continue;
+
+ MachineInstr *UniformSourceInstr = MRI.getVRegDef(SrcReg);
+
+ // FixMe: SrcReg is lane mask in this case. Find a better way to detect it.
+ if (UniformSourceInstr->getOpcode() == AMDGPU::SI_IF_BREAK ||
+ UserInstr->getOpcode() == AMDGPU::SI_IF)
+ continue;
+
+ unsigned Size = TRI.getRegSizeInBits(*MRI.getRegClassOrNull(SrcReg));
+ Register VgprDst =
+ MRI.createVirtualRegister(TRI.getVGPRClassForBitWidth(Size));
+
+ auto [MBB, AfterUniformSourceReg] = getInsertAfterPtrs(UniformSourceInstr);
+ BuildMI(*MBB, AfterUniformSourceReg, {}, TII.get(AMDGPU::COPY))
+ .addDef(VgprDst)
+ .addReg(SrcReg)
+ .addReg(AMDGPU::EXEC, RegState::Implicit);
+
+ replaceUseRegisterWith(UserInstr, SrcReg, VgprDst);
+ }
+
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 0922e8d99deb3aa..3f838ae75edaeb8 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -96,6 +96,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUTargetMachine.cpp
AMDGPUTargetObjectFile.cpp
AMDGPUTargetTransformInfo.cpp
+ AMDGPUTemporalDivergenceLowering.cpp
AMDGPUUnifyDivergentExitNodes.cpp
AMDGPUUnifyMetadata.cpp
R600MachineCFGStructurizer.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 84f67b3faac3c07..91aac981c2c9e94 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -101,10 +101,12 @@
; GCN-O0-NEXT: Finalize ISel and expand pseudo-instructions
; GCN-O0-NEXT: Local Stack Slot Allocation
; GCN-O0-NEXT: Register Usage Information Propagation
+; GCN-O0-NEXT: Machine Cycle Info Analysis
+; GCN-O0-NEXT: MachineDominator Tree Construction
+; GCN-O0-NEXT: Temporal divergence lowering
; GCN-O0-NEXT: Eliminate PHI nodes for register allocation
; GCN-O0-NEXT: SI Lower control flow pseudo instructions
; GCN-O0-NEXT: Two-Address instruction pass
-; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
; GCN-O0-NEXT: MachinePostDominator Tree Construction
@@ -322,12 +324,13 @@
; GCN-O1-NEXT: Remove dead machine instructions
; GCN-O1-NEXT: SI Shrink Instructions
; GCN-O1-NEXT: Register Usage Information Propagation
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: Temporal divergence lowering
; GCN-O1-NEXT: Detect Dead Lanes
; GCN-O1-NEXT: Remove dead machine instructions
; GCN-O1-NEXT: Process Implicit Definitions
; GCN-O1-NEXT: Remove unreachable machine basic blocks
; GCN-O1-NEXT: Live Variable Analysis
-; GCN-O1-NEXT: MachineDominator Tree Construction
; GCN-O1-NEXT: SI Optimize VGPR LiveRange
; GCN-O1-NEXT: Eliminate PHI nodes for register allocation
; GCN-O1-NEXT: SI Lower control flow pseudo instructions
@@ -616,6 +619,8 @@
; GCN-O1-OPTS-NEXT: Remove dead machine instructions
; GCN-O1-OPTS-NEXT: SI Shrink Instructions
; GCN-O1-OPTS-NEXT: Register Usage Information Propagation
+; GCN-O1-OPTS-NEXT: Machine Cycle Info Analysis
+; GCN-O1-OPTS-NEXT: Temporal divergence lowering
; GCN-O1-OPTS-NEXT: Detect Dead Lanes
; GCN-O1-OPTS-NEXT: Remove dead machine instructions
; GCN-O1-OPTS-NEXT: Process Implicit Definitions
@@ -919,6 +924,8 @@
; GCN-O2-NEXT: Remove dead machine instructions
; GCN-O2-NEXT: SI Shrink Instructions
; GCN-O2-NEXT: Register Usage Information Propagation
+; GCN-O2-NEXT: Machine Cycle Info Analysis
+; GCN-O2-NEXT: Temporal divergence lowering
; GCN-O2-NEXT: Detect Dead Lanes
; GCN-O2-NEXT: Remove dead machine instructions
; GCN-O2-NEXT: Process Implicit Definitions
@@ -1235,6 +1242,8 @@
; GCN-O3-NEXT: Remove dead machine instructions
; GCN-O3-NEXT: SI Shrink Instructions
; GCN-O3-NEXT: Register Usage Information Propagation
+; GCN-O3-NEXT: Machine Cycle Info Analysis
+; GCN-O3-NEXT: Temporal divergence lowering
; GCN-O3-NEXT: Detect Dead Lanes
; GCN-O3-NEXT: Remove dead machine instructions
; GCN-O3-NEXT: Process Implicit Definitions
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index e2456b74f7ef1fa..b8e74bc7db09a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -21,6 +21,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_1: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4
; CHECK-NEXT: .LBB0_2: ; %Flow1
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
@@ -53,7 +54,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_add_nc_u32_e32 v4, s9, v2
; CHECK-NEXT: v_cmp_ge_u32_e64 s4, v4, v0
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -4, v4
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execz .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
index cc14b4a80d58a7d..037a285794120da 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
@@ -42,7 +42,6 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[SI_IF1]], [[SI_IF]], implicit-def dead $scc
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.5
@@ -52,6 +51,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4
; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/67033
More information about the llvm-commits
mailing list