[llvm] 122efef - Revert "Reapply "[CodeGen] Add new pass for late cleanup of redundant definitions.""
Jonas Paulsson via llvm-commits
llvm-commits@lists.llvm.org
Sun Dec 4 15:57:19 PST 2022
Author: Jonas Paulsson
Date: 2022-12-05T00:52:00+01:00
New Revision: 122efef8ee9be57055d204d52c38700fe933c033
URL: https://github.com/llvm/llvm-project/commit/122efef8ee9be57055d204d52c38700fe933c033
DIFF: https://github.com/llvm/llvm-project/commit/122efef8ee9be57055d204d52c38700fe933c033.diff
LOG: Revert "Reapply "[CodeGen] Add new pass for late cleanup of redundant definitions.""
This reverts commit 17db0de330f943833296ae72e26fa988bba39cb3.
Some more buildbots were broken - need to investigate.
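For context, the reverted pass removes identical, redundant immediate or address loads to the same register after register allocation (see the deleted MachineLateInstrsCleanup.cpp below). A minimal illustrative sketch of the kind of sequence it targets - hypothetical AArch64-style code, not taken from any test in this commit:

    mov  x8, #42          // first definition of x8
    str  x0, [sp, #16]    // does not clobber x8 or the frame register
    mov  x8, #42          // identical redefinition - the pass would delete this
    ldr  x1, [x8]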
Added:
Modified:
llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
llvm/include/llvm/CodeGen/MachinePassRegistry.def
llvm/include/llvm/CodeGen/Passes.h
llvm/include/llvm/InitializePasses.h
llvm/lib/CodeGen/CMakeLists.txt
llvm/lib/CodeGen/CodeGen.cpp
llvm/lib/CodeGen/TargetPassConfig.cpp
llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
llvm/test/CodeGen/AArch64/O3-pipeline.ll
llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
llvm/test/CodeGen/AMDGPU/cc-update.ll
llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
llvm/test/CodeGen/AMDGPU/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/multilevel-break.ll
llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
llvm/test/CodeGen/ARM/O3-pipeline.ll
llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
llvm/test/CodeGen/ARM/fpclamptosat.ll
llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
llvm/test/CodeGen/ARM/jump-table-islands.ll
llvm/test/CodeGen/ARM/reg_sequence.ll
llvm/test/CodeGen/BPF/objdump_cond_op_2.ll
llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
llvm/test/CodeGen/Mips/llvm-ir/shl.ll
llvm/test/CodeGen/PowerPC/O3-pipeline.ll
llvm/test/CodeGen/PowerPC/cgp-select.ll
llvm/test/CodeGen/PowerPC/fast-isel-branch.ll
llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
llvm/test/CodeGen/Thumb/frame-access.ll
llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
llvm/test/CodeGen/X86/2008-04-09-BranchFolding.ll
llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
llvm/test/CodeGen/X86/AMX/amx-across-func.ll
llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
llvm/test/CodeGen/X86/fast-isel-stackcheck.ll
llvm/test/CodeGen/X86/fshl.ll
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/opt-pipeline.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/shift-i128.ll
llvm/test/CodeGen/X86/ushl_sat_vec.ll
llvm/test/CodeGen/X86/vec_extract.ll
llvm/test/CodeGen/X86/vec_shift5.ll
llvm/test/CodeGen/XCore/scavenging.ll
Removed:
llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
llvm/test/CodeGen/SystemZ/frame-28.mir
################################################################################
diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
index c3fd2b2cc667c..48aa4b034fee0 100644
--- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
@@ -1130,9 +1130,6 @@ void CodeGenPassBuilder<Derived>::addMachineLateOptimization(
if (!TM.requiresStructuredCFG())
addPass(TailDuplicatePass());
- // Cleanup of redundant (identical) address/immediate loads.
- addPass(MachineLateInstrsCleanupPass());
-
// Copy propagation.
addPass(MachineCopyPropagationPass());
}
diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
index 0bb46e405bbe2..c1ceff9680d67 100644
--- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -151,7 +151,6 @@ DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass, ())
DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass, ())
DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass, ())
DUMMY_MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass, ())
-DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass, ())
DUMMY_MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass, ())
DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass, ())
DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass, ())
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index b121ecbd9627b..5701dd13e152c 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -334,10 +334,6 @@ namespace llvm {
MachineFunctionPass *createMachineCopyPropagationPass(bool UseCopyInstr);
- /// MachineLateInstrsCleanup - This pass removes redundant identical
- /// instructions after register allocation and rematerialization.
- extern char &MachineLateInstrsCleanupID;
-
/// PeepholeOptimizer - This pass performs peephole optimizations -
/// like extension and comparison eliminations.
extern char &PeepholeOptimizerID;
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 680cb37ce4cdf..26e2d7e596dd6 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -277,7 +277,6 @@ void initializeMachineDominanceFrontierPass(PassRegistry&);
void initializeMachineDominatorTreePass(PassRegistry&);
void initializeMachineFunctionPrinterPassPass(PassRegistry&);
void initializeMachineFunctionSplitterPass(PassRegistry &);
-void initializeMachineLateInstrsCleanupPass(PassRegistry&);
void initializeMachineLICMPass(PassRegistry&);
void initializeMachineLoopInfoPass(PassRegistry&);
void initializeMachineModuleInfoWrapperPassPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 35d10120a311f..db3b6183b5fd2 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -119,7 +119,6 @@ add_llvm_component_library(LLVMCodeGen
MachineFunctionSplitter.cpp
MachineInstrBundle.cpp
MachineInstr.cpp
- MachineLateInstrsCleanup.cpp
MachineLICM.cpp
MachineLoopInfo.cpp
MachineLoopUtils.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index beedd94180053..a8bde3b700970 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -78,7 +78,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineCycleInfoWrapperPassPass(Registry);
initializeMachineDominatorTreePass(Registry);
initializeMachineFunctionPrinterPassPass(Registry);
- initializeMachineLateInstrsCleanupPass(Registry);
initializeMachineLICMPass(Registry);
initializeMachineLoopInfoPass(Registry);
initializeMachineModuleInfoWrapperPassPass(Registry);
diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
deleted file mode 100644
index 41a0223a3ece6..0000000000000
--- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-//==--- MachineLateInstrsCleanup.cpp - Late Instructions Cleanup Pass -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This simple pass removes any identical and redundant immediate or address
-// loads to the same register. The immediate loads removed can originally be
-// the result of rematerialization, while the addresses are redundant frame
-// addressing anchor points created during Frame Indices elimination.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "machine-latecleanup"
-
-STATISTIC(NumRemoved, "Number of redundant instructions removed.");
-
-namespace {
-
-class MachineLateInstrsCleanup : public MachineFunctionPass {
- const TargetRegisterInfo *TRI;
- const TargetInstrInfo *TII;
-
- // Data structures to map regs to their definitions per MBB.
- using Reg2DefMap = std::map<Register, MachineInstr*>;
- std::vector<Reg2DefMap> RegDefs;
-
- // Walk through the instructions in MBB and remove any redundant
- // instructions.
- bool processBlock(MachineBasicBlock *MBB);
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- MachineLateInstrsCleanup() : MachineFunctionPass(ID) {
- initializeMachineLateInstrsCleanupPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
-};
-
-} // end anonymous namespace
-
-char MachineLateInstrsCleanup::ID = 0;
-
-char &llvm::MachineLateInstrsCleanupID = MachineLateInstrsCleanup::ID;
-
-INITIALIZE_PASS(MachineLateInstrsCleanup, DEBUG_TYPE,
- "Machine Late Instructions Cleanup Pass", false, false)
-
-bool MachineLateInstrsCleanup::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(MF.getFunction()))
- return false;
-
- bool Changed = false;
-
- TRI = MF.getSubtarget().getRegisterInfo();
- TII = MF.getSubtarget().getInstrInfo();
-
- RegDefs.clear();
- RegDefs.resize(MF.getNumBlockIDs());
-
- // Visit all MBBs in an order that maximises the reuse from predecessors.
- ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
- for (MachineBasicBlock *MBB : RPOT)
- Changed |= processBlock(MBB);
-
- return Changed;
-}
-
-// Clear any previous kill flag on Reg found before I in MBB. Walk backwards
-// in MBB and if needed continue in predecessors until a use/def of Reg is
-// encountered. This seems to be faster in practice than tracking kill flags
-// in a map.
-static void clearKillsForDef(Register Reg, MachineBasicBlock *MBB,
- MachineBasicBlock::iterator I,
- BitVector &VisitedPreds,
- const TargetRegisterInfo *TRI) {
- VisitedPreds.set(MBB->getNumber());
- while (I != MBB->begin()) {
- I--;
- bool Found = false;
- for (auto &MO : I->operands())
- if (MO.isReg() && TRI->regsOverlap(MO.getReg(), Reg)) {
- if (MO.isDef())
- return;
- if (MO.readsReg()) {
- MO.setIsKill(false);
- Found = true; // Keep going for an implicit kill of the super-reg.
- }
- }
- if (Found)
- return;
- }
-
- // If an earlier def is not in MBB, continue in predecessors.
- if (!MBB->isLiveIn(Reg))
- MBB->addLiveIn(Reg);
- assert(!MBB->pred_empty() && "Predecessor def not found!");
- for (MachineBasicBlock *Pred : MBB->predecessors())
- if (!VisitedPreds.test(Pred->getNumber()))
- clearKillsForDef(Reg, Pred, Pred->end(), VisitedPreds, TRI);
-}
-
-static void removeRedundantDef(MachineInstr *MI,
- const TargetRegisterInfo *TRI) {
- Register Reg = MI->getOperand(0).getReg();
- BitVector VisitedPreds(MI->getMF()->getNumBlockIDs());
- clearKillsForDef(Reg, MI->getParent(), MI->getIterator(), VisitedPreds, TRI);
- MI->eraseFromParent();
- ++NumRemoved;
-}
-
-// Return true if MI is a potential candidate for reuse/removal and if so
-// also the register it defines in DefedReg. A candidate is a simple
-// instruction that does not touch memory, has only one register definition
-// and the only reg it may use is FrameReg. Typically this is an immediate
-// load or a load-address instruction.
-static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
- Register FrameReg) {
- DefedReg = MCRegister::NoRegister;
- bool SawStore = true;
- if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() ||
- MI->isInlineAsm())
- return false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (MO.isReg()) {
- if (MO.isDef()) {
- if (i == 0 && !MO.isImplicit() && !MO.isDead())
- DefedReg = MO.getReg();
- else
- return false;
- } else if (MO.getReg() && MO.getReg() != FrameReg)
- return false;
- } else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() ||
- MO.isGlobal() || MO.isSymbol()))
- return false;
- }
- return DefedReg.isValid();
-}
-
-bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) {
- bool Changed = false;
-
- Reg2DefMap &MBBDefs = RegDefs[MBB->getNumber()];
-
- // Find reusable definitions in the predecessor(s).
- if (!MBB->pred_empty()) {
- MachineBasicBlock *FirstPred = *MBB->pred_begin();
- for (auto [Reg, DefMI] : RegDefs[FirstPred->getNumber()])
- if (llvm::all_of(
- drop_begin(MBB->predecessors()),
- [&, &Reg = Reg, &DefMI = DefMI](const MachineBasicBlock *Pred) {
- auto PredDefI = RegDefs[Pred->getNumber()].find(Reg);
- return PredDefI != RegDefs[Pred->getNumber()].end() &&
- DefMI->isIdenticalTo(*PredDefI->second);
- })) {
- MBBDefs[Reg] = DefMI;
- LLVM_DEBUG(dbgs() << "Reusable instruction from pred(s): in "
- << printMBBReference(*MBB) << ": " << *DefMI;);
- }
- }
-
- // Process MBB.
- MachineFunction *MF = MBB->getParent();
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- Register FrameReg = TRI->getFrameRegister(*MF);
- for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
- // If FrameReg is modified, no previous load-address instructions are valid.
- if (MI.modifiesRegister(FrameReg, TRI)) {
- MBBDefs.clear();
- continue;
- }
-
- Register DefedReg;
- bool IsCandidate = isCandidate(&MI, DefedReg, FrameReg);
-
- // Check for an earlier identical and reusable instruction.
- if (IsCandidate) {
- auto DefI = MBBDefs.find(DefedReg);
- if (DefI != MBBDefs.end() && MI.isIdenticalTo(*DefI->second)) {
- LLVM_DEBUG(dbgs() << "Removing redundant instruction in "
- << printMBBReference(*MBB) << ": " << MI;);
- removeRedundantDef(&MI, TRI);
- Changed = true;
- continue;
- }
- }
-
- // Clear any entries in map that MI clobbers.
- for (auto DefI = MBBDefs.begin(); DefI != MBBDefs.end();) {
- Register Reg = DefI->first;
- if (MI.modifiesRegister(Reg, TRI))
- DefI = MBBDefs.erase(DefI);
- else
- ++DefI;
- }
-
- // Record this MI for potential later reuse.
- if (IsCandidate) {
- LLVM_DEBUG(dbgs() << "Found interesting instruction in "
- << printMBBReference(*MBB) << ": " << MI;);
- MBBDefs[DefedReg] = &MI;
- }
- }
-
- return Changed;
-}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index b95d5790e10b5..59e714c4f2801 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1522,9 +1522,6 @@ void TargetPassConfig::addOptimizedRegAlloc() {
/// Add passes that optimize machine instructions after register allocation.
void TargetPassConfig::addMachineLateOptimization() {
- // Cleanup of redundant immediate/address loads.
- addPass(&MachineLateInstrsCleanupID);
-
// Branch folding must be run after regalloc and prolog/epilog insertion.
addPass(&BranchFolderPassID);
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 259ecbfacb210..56c605d997d49 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -291,7 +291,6 @@ void NVPTXPassConfig::addIRPasses() {
// of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
// NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
disablePass(&PrologEpilogCodeInserterID);
- disablePass(&MachineLateInstrsCleanupID);
disablePass(&MachineCopyPropagationID);
disablePass(&TailDuplicateID);
disablePass(&StackMapLivenessID);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 43e8c7f551ea1..dff8ccccc3a60 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -286,10 +286,6 @@ void RISCVPassConfig::addPreRegAlloc() {
void RISCVPassConfig::addPostRegAlloc() {
if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination)
addPass(createRISCVRedundantCopyEliminationPass());
-
- // Temporarily disabled until post-RA pseudo expansion problem is fixed,
- // see D123394 and D139169.
- disablePass(&MachineLateInstrsCleanupID);
}
yaml::MachineFunctionInfo *
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 497ba8d0b2f7f..ffc3de244ede1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -501,7 +501,6 @@ void WebAssemblyPassConfig::addPostRegAlloc() {
// them.
// These functions all require the NoVRegs property.
- disablePass(&MachineLateInstrsCleanupID);
disablePass(&MachineCopyPropagationID);
disablePass(&PostRAMachineSinkingID);
disablePass(&PostRASchedulerID);
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 15b906d1eb393..90cf49e8ed8fc 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -188,7 +188,6 @@
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Shrink Wrapping analysis
; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
-; CHECK-NEXT: Machine Late Instructions Cleanup Pass
; CHECK-NEXT: Control Flow Optimizer
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Tail Duplication
diff --git a/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll b/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
index b71e9e2de7c96..bb3397efbefa5 100644
--- a/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
@@ -29,8 +29,14 @@ define i32 @test_stack_guard_remat2() ssp {
; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr x9, [x9]
; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: Lloh6:
+; CHECK-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE
; CHECK-NEXT: stur x9, [x29, #-8]
+; CHECK-NEXT: Lloh7:
+; CHECK-NEXT: ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF]
; CHECK-NEXT: ldur x9, [x29, #-8]
+; CHECK-NEXT: Lloh8:
+; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: b.ne LBB0_2
; CHECK-NEXT: ; %bb.1: ; %entry
@@ -40,6 +46,7 @@ define i32 @test_stack_guard_remat2() ssp {
; CHECK-NEXT: ret
; CHECK-NEXT: LBB0_2: ; %entry
; CHECK-NEXT: bl ___stack_chk_fail
+; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh6, Lloh7, Lloh8
; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh1, Lloh3, Lloh5
; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh0, Lloh2, Lloh4
entry:
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 34d4612fb9257..554f9b986b23d 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -59,23 +59,26 @@ define float @foo2(double* %x0, double* %x1) nounwind {
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: add x9, sp, #16
; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0]
; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: fmov s0, #1.00000000
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: mov w1, #1
; CHECK-NEXT: mov w2, #2
+; CHECK-NEXT: st1d { z16.d }, p0, [x9]
+; CHECK-NEXT: add x9, sp, #16
; CHECK-NEXT: mov w3, #3
; CHECK-NEXT: mov w4, #4
; CHECK-NEXT: mov w5, #5
; CHECK-NEXT: mov w6, #6
-; CHECK-NEXT: mov w7, #7
-; CHECK-NEXT: add x9, sp, #16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: st1d { z16.d }, p0, [x9]
; CHECK-NEXT: st1d { z17.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: add x9, sp, #16
+; CHECK-NEXT: mov w7, #7
; CHECK-NEXT: st1d { z18.d }, p0, [x9, #2, mul vl]
+; CHECK-NEXT: add x9, sp, #16
; CHECK-NEXT: st1d { z19.d }, p0, [x9, #3, mul vl]
; CHECK-NEXT: str x8, [sp]
; CHECK-NEXT: bl callee2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7ed11c7abbb6b..37592a7f99ee9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -157,6 +157,8 @@ define amdgpu_kernel void @kernel_caller_byval() {
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16
; FLATSCR-NEXT: s_mov_b32 s11, 0
; FLATSCR-NEXT: s_mov_b32 s10, 0
; FLATSCR-NEXT: s_mov_b32 s9, 0
@@ -169,8 +171,9 @@ define amdgpu_kernel void @kernel_caller_byval() {
; FLATSCR-NEXT: s_mov_b32 s4, 0
; FLATSCR-NEXT: s_mov_b32 s3, 0
; FLATSCR-NEXT: s_mov_b32 s2, 0
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16
+; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0
+; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT: s_mov_b32 s40, 0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s11 offset:24
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s10 offset:32
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s9 offset:40
@@ -185,7 +188,6 @@ define amdgpu_kernel void @kernel_caller_byval() {
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2 offset:112
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_lo offset:120
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], vcc_hi offset:128
-; FLATSCR-NEXT: s_mov_b32 s40, 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s40 offset:8
; FLATSCR-NEXT: s_mov_b32 s39, 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s39 offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 5f71713f4a6c4..d8705bffe7e90 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1354,6 +1354,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7-NEXT: s_cbranch_execz .LBB13_2
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index a3e22a3bc7db6..83647a04467f0 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -537,6 +537,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT: ;;#ASMSTART
; GFX803-NEXT: ;;#ASMEND
+; GFX803-NEXT: s_mov_b32 s4, 0x40000
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
@@ -553,6 +554,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x40000
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
@@ -567,6 +569,8 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT: s_waitcnt vmcnt(0)
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-NEXT: s_mov_b32 s4, 0x20000
; GFX1010-NEXT: ;;#ASMSTART
; GFX1010-NEXT: ;;#ASMEND
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
@@ -581,6 +585,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX1100-NEXT: s_waitcnt vmcnt(0)
; GFX1100-NEXT: s_movk_i32 s0, 0x1000
; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill
+; GFX1100-NEXT: s_movk_i32 s0, 0x1000
; GFX1100-NEXT: ;;#ASMSTART
; GFX1100-NEXT: ;;#ASMEND
; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 3f50130cce480..b4c00f331eedd 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -76,10 +76,12 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
; CHECK-NEXT: ; %bb.10: ; %bb16
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_mov_b64 s[16:17], 0
+; CHECK-NEXT: s_mov_b64 s[20:21], -1
; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11]
; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17]
; CHECK-NEXT: s_branch .LBB0_2
; CHECK-NEXT: .LBB0_11: ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: s_mov_b64 s[22:23], -1
; CHECK-NEXT: s_mov_b64 s[20:21], 0
; CHECK-NEXT: ; implicit-def: $sgpr16_sgpr17
; CHECK-NEXT: s_mov_b64 s[18:19], s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 06ee6b4998eaf..c0590ce38f28c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -22,12 +22,18 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_mov_b32 vcc_lo, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:52
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
@@ -37,6 +43,7 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 4
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_mov_b32 s3, s0
@@ -48,12 +55,15 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v4
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: zero_init_kernel:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, 4
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@@ -64,6 +74,9 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:36
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:20
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v4
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
@@ -87,10 +100,16 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT: s_mov_b32 s1, 0
+; GFX9-PAL-NEXT: s_mov_b32 s0, 0
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:52
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4
+; GFX9-PAL-NEXT: s_nop 0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
@@ -105,6 +124,11 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
@@ -129,10 +153,15 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT: s_mov_b32 s2, 0
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
+; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:52
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:36
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:4
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v4
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
@@ -147,6 +176,7 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4
; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
@@ -158,12 +188,15 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v4
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: zero_init_kernel:
; GFX11-PAL: ; %bb.0:
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
-; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v4, 4
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@@ -174,11 +207,15 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v4
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT: s_endpgm
%alloca = alloca [32 x i16], align 2, addrspace(5)
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+ call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
ret void
}
@@ -198,6 +235,11 @@ define void @zero_init_foo() {
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s32
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -206,6 +248,7 @@ define void @zero_init_foo() {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s32
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_mov_b32 s3, s0
@@ -217,6 +260,9 @@ define void @zero_init_foo() {
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v4
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -225,7 +271,7 @@ define void @zero_init_foo() {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, s32
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@@ -236,6 +282,9 @@ define void @zero_init_foo() {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v4
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -254,6 +303,11 @@ define void @zero_init_foo() {
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32
+; GFX9-PAL-NEXT: s_nop 0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
@@ -270,6 +324,11 @@ define void @zero_init_foo() {
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v0, s32
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
@@ -278,6 +337,7 @@ define void @zero_init_foo() {
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s32
; GFX10-PAL-NEXT: s_mov_b32 s1, s0
; GFX10-PAL-NEXT: s_mov_b32 s2, s0
; GFX10-PAL-NEXT: s_mov_b32 s3, s0
@@ -289,6 +349,9 @@ define void @zero_init_foo() {
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v4
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
@@ -297,7 +360,7 @@ define void @zero_init_foo() {
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
-; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v4, s32
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@@ -308,26 +371,15 @@ define void @zero_init_foo() {
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v4
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: zero_init_foo:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s2, s0
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
-; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
-; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
-; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca [32 x i16], align 2, addrspace(5)
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+ call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
ret void
}
@@ -348,6 +400,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
@@ -368,6 +424,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_kernel:
@@ -384,6 +444,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
@@ -406,6 +470,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_kernel:
@@ -422,6 +490,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX940-NEXT: s_add_i32 s0, s0, 4
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
@@ -447,6 +519,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v0
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_kernel:
@@ -463,22 +539,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
-; GCN-LABEL: store_load_sindex_kernel:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v0, 15
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s1, s0, 2
-; GCN-NEXT: s_and_b32 s0, s0, 15
-; GCN-NEXT: s_lshl_b32 s0, s0, 2
-; GCN-NEXT: s_add_u32 s1, 4, s1
-; GCN-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_add_u32 s0, 4, s0
-; GCN-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_endpgm
bb:
%i = alloca [32 x float], align 4, addrspace(5)
%i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -489,6 +554,7 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -507,6 +573,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
@@ -525,6 +595,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_foo:
@@ -539,6 +613,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
@@ -560,6 +638,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_foo:
@@ -574,6 +656,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX940-NEXT: s_add_i32 s0, s0, 4
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
@@ -597,6 +683,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v0
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_foo:
@@ -611,20 +701,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
-; GCN-LABEL: store_load_sindex_foo:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_lshl_b32 s1, s0, 2
-; GCN-NEXT: s_and_b32 s0, s0, 15
-; GCN-NEXT: s_lshl_b32 s0, s0, 2
-; GCN-NEXT: s_add_u32 s1, 4, s1
-; GCN-NEXT: v_mov_b32_e32 v0, 15
-; GCN-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_add_u32 s0, 4, s0
-; GCN-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_endpgm
bb:
%i = alloca [32 x float], align 4, addrspace(5)
%i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -635,6 +716,7 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -651,6 +733,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
@@ -667,6 +753,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
@@ -678,6 +768,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
@@ -697,6 +791,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
@@ -708,6 +806,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
@@ -729,6 +831,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v0
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_kernel:
@@ -740,17 +846,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
-; GCN-LABEL: store_load_vindex_kernel:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 15
-; GCN-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_sub_u32_e32 v0, 4, v0
-; GCN-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_endpgm
bb:
%i = alloca [32 x float], align 4, addrspace(5)
%i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -763,6 +863,7 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -779,6 +880,9 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v1
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
@@ -793,6 +897,10 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s32
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_foo:
@@ -807,6 +915,10 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s32
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
@@ -821,6 +933,9 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v1
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
@@ -834,6 +949,10 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, s32
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
@@ -848,6 +967,10 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s32
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v0
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_foo:
@@ -862,19 +985,11 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s32
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_vindex_foo:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, 15
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT: v_and_b32_e32 v0, v0, v2
-; GCN-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [32 x float], align 4, addrspace(5)
%i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -885,6 +1000,7 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -948,13 +1064,6 @@ define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: private_ptr_foo:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GCN-NEXT: scratch_store_dword v0, v1, off offset:4
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
store float 1.000000e+01, float addrspace(5)* %gep, align 4
ret void
@@ -977,12 +1086,22 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_mov_b32 vcc_lo, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:260
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
@@ -994,6 +1113,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 4
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_mov_b32 s3, s0
@@ -1001,10 +1121,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: v_mov_b32_e32 v5, 0x104
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v4
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v5
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: zero_init_small_offset_kernel:
@@ -1012,7 +1139,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@@ -1023,6 +1150,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:276
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:292
; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v4
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v5
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
@@ -1049,10 +1182,20 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT: s_mov_b32 s1, 0
+; GFX9-PAL-NEXT: s_mov_b32 s0, 0
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:260
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308
+; GFX9-PAL-NEXT: s_nop 0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: zero_init_small_offset_kernel:
@@ -1069,6 +1212,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1095,11 +1247,20 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT: s_mov_b32 s2, 0
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
+; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:260
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v5, 0x104
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:276
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:308
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v4
+; GFX1010-PAL-NEXT: ;;#ASMEND
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v5
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1116,6 +1277,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4
; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
@@ -1123,10 +1285,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v5, 0x104
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v4
+; GFX1030-PAL-NEXT: ;;#ASMEND
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v5
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1134,7 +1303,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
-; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@@ -1145,6 +1314,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v4
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v5
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT: s_endpgm
%padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1153,6 +1328,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
%pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+ call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
ret void
}
@@ -1174,6 +1351,15 @@ define void @zero_init_small_offset_foo() {
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100
+; GFX9-NEXT: v_mov_b32_e32 v0, s32
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1184,6 +1370,7 @@ define void @zero_init_small_offset_foo() {
; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_mov_b32 s3, s0
@@ -1191,10 +1378,18 @@ define void @zero_init_small_offset_foo() {
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: v_mov_b32_e32 v4, s32
+; GFX10-NEXT: v_mov_b32_e32 v5, vcc_lo
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v4
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v5
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1205,17 +1400,24 @@ define void @zero_init_small_offset_foo() {
; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v4
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v5
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -1236,6 +1438,15 @@ define void @zero_init_small_offset_foo() {
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1254,6 +1465,16 @@ define void @zero_init_small_offset_foo() {
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, s32
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
@@ -1264,6 +1485,7 @@ define void @zero_init_small_offset_foo() {
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT: s_mov_b32 s1, s0
; GFX10-PAL-NEXT: s_mov_b32 s2, s0
; GFX10-PAL-NEXT: s_mov_b32 s3, s0
@@ -1271,10 +1493,18 @@ define void @zero_init_small_offset_foo() {
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s32
+; GFX10-PAL-NEXT: v_mov_b32_e32 v5, vcc_lo
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v4
+; GFX10-PAL-NEXT: ;;#ASMEND
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v5
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1285,42 +1515,34 @@ define void @zero_init_small_offset_foo() {
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
-; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-PAL-NEXT: v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo
; GFX11-PAL-NEXT: s_clause 0x3
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v4
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v5
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: zero_init_small_offset_foo:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v0, off, s32 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s2, s0
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
-; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
-; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
-; GCN-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
%pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
%pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+ call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
ret void
}
@@ -1343,6 +1565,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_addk_i32 s0, 0x104
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
@@ -1355,6 +1585,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x104
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
@@ -1365,6 +1596,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v1
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
@@ -1372,7 +1610,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
@@ -1383,6 +1621,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v1
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1408,6 +1653,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_kernel:
@@ -1426,6 +1679,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX940-NEXT: s_addk_i32 s0, 0x104
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1441,6 +1702,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
@@ -1454,6 +1716,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v0
+; GFX1010-PAL-NEXT: ;;#ASMEND
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v1
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1471,6 +1740,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
@@ -1481,6 +1751,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v0
+; GFX1030-PAL-NEXT: ;;#ASMEND
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v1
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1488,7 +1765,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104
; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
@@ -1499,6 +1776,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v1
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1513,6 +1797,8 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -1534,6 +1820,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-NEXT: s_addk_i32 s0, 0x104
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
@@ -1554,13 +1848,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v1
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_foo:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
@@ -1570,6 +1872,13 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v1
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
@@ -1594,6 +1903,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_foo:
@@ -1610,9 +1927,17 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX940-NEXT: s_addk_i32 s0, 0x104
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: s_endpgm
-;
-; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_endpgm
+;
+; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1010-PAL: ; %bb.0: ; %bb
; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
@@ -1636,6 +1961,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v0
+; GFX1010-PAL-NEXT: ;;#ASMEND
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v1
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
@@ -1661,13 +1994,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v0
+; GFX1030-PAL-NEXT: ;;#ASMEND
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v1
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104
; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
@@ -1677,6 +2018,13 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v1
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1691,6 +2039,8 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -1710,6 +2060,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX9-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v1
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
@@ -1728,6 +2086,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v1
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_small_offset_kernel:
@@ -1738,8 +2104,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x104
; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v1
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -1762,6 +2136,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v1
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vindex_small_offset_kernel:
@@ -1775,6 +2157,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 0x104
+; GFX940-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v1
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -1799,6 +2189,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v0
+; GFX1010-PAL-NEXT: ;;#ASMEND
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v1
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -1822,6 +2220,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v0
+; GFX1030-PAL-NEXT: ;;#ASMEND
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v1
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -1832,8 +2238,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x104
; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v1
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1850,6 +2264,8 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -1869,6 +2285,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s32
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v1
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
@@ -1876,17 +2299,26 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
+; GFX10-NEXT: s_add_i32 s1, s32, 0x100
; GFX10-NEXT: s_add_i32 s0, s32, 0x100
-; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s0
; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s32
+; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v1
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_small_offset_foo:
@@ -1894,6 +2326,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1902,6 +2335,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v1
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -1919,6 +2359,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s32
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v1
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_small_offset_foo:
@@ -1934,6 +2381,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, s32
+; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x100
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -1941,17 +2397,26 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0
+; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x100
; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x100
-; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
-; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s0
; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s32
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v0
+; GFX10-PAL-NEXT: ;;#ASMEND
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v1
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -1959,6 +2424,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -1967,21 +2433,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v1
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_vindex_small_offset_foo:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, 15
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT: v_and_b32_e32 v0, v0, v2
-; GCN-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -1995,6 +2454,8 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -2022,6 +2483,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_large_offset_kernel:
@@ -2044,10 +2514,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX10-NEXT: s_movk_i32 s2, 0x4004
; GFX10-NEXT: s_movk_i32 s1, 0x4004
; GFX10-NEXT: s_movk_i32 s0, 0x4004
+; GFX10-NEXT: v_mov_b32_e32 v4, 4
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s2
+; GFX10-NEXT: v_mov_b32_e32 v5, 0x4004
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v4
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v5
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: zero_init_large_offset_kernel:
@@ -2064,11 +2542,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX11-NEXT: s_movk_i32 s2, 0x4004
; GFX11-NEXT: s_movk_i32 s1, 0x4004
; GFX11-NEXT: s_movk_i32 s0, 0x4004
+; GFX11-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32
; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v4
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v5
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
@@ -2100,6 +2585,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-PAL-NEXT: s_nop 0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: zero_init_large_offset_kernel:
@@ -2120,6 +2614,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2149,9 +2652,17 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x4004
; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v4, 4
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v5, 0x4004
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v4
+; GFX1010-PAL-NEXT: ;;#ASMEND
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v5
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2179,10 +2690,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX1030-PAL-NEXT: s_movk_i32 s2, 0x4004
; GFX1030-PAL-NEXT: s_movk_i32 s1, 0x4004
; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x4004
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v4, 4
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v5, 0x4004
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v4
+; GFX1030-PAL-NEXT: ;;#ASMEND
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v5
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2199,11 +2718,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX11-PAL-NEXT: s_movk_i32 s2, 0x4004
; GFX11-PAL-NEXT: s_movk_i32 s1, 0x4004
; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004
+; GFX11-PAL-NEXT: v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004
; GFX11-PAL-NEXT: s_clause 0x3
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s2
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v4
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v5
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT: s_endpgm
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2212,6 +2738,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
%pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+ call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
ret void
}
@@ -2229,14 +2757,24 @@ define void @zero_init_large_offset_foo() {
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: s_add_i32 s3, s32, 0x4004
+; GFX9-NEXT: s_add_i32 s2, s32, 0x4004
; GFX9-NEXT: s_add_i32 s1, s32, 0x4004
; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX9-NEXT: s_add_i32 vcc_lo, s32, 0x4004
+; GFX9-NEXT: s_add_i32 vcc_lo, s32, 4
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s3
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s1
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-NEXT: v_mov_b32_e32 v0, vcc_lo
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -2247,6 +2785,7 @@ define void @zero_init_large_offset_foo() {
; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_add_i32 s4, s32, 0x4004
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_mov_b32 s3, s0
@@ -2254,14 +2793,23 @@ define void @zero_init_large_offset_foo() {
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_add_i32 s3, s32, 4
; GFX10-NEXT: s_add_i32 s2, s32, 0x4004
; GFX10-NEXT: s_add_i32 s1, s32, 0x4004
; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s2
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s4
+; GFX10-NEXT: v_mov_b32_e32 v4, s3
+; GFX10-NEXT: v_mov_b32_e32 v5, s2
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v4
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v5
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -2272,21 +2820,29 @@ define void @zero_init_large_offset_foo() {
; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s4, s32, 4
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: s_add_i32 s3, s32, 0x4004
; GFX11-NEXT: s_add_i32 s2, s32, 0x4004
; GFX11-NEXT: s_add_i32 s1, s32, 0x4004
; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32
; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v4
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v5
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -2303,14 +2859,24 @@ define void @zero_init_large_offset_foo() {
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT: s_add_i32 s3, s32, 0x4004
+; GFX9-PAL-NEXT: s_add_i32 s2, s32, 0x4004
; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004
; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
+; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 4
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s3
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2325,14 +2891,25 @@ define void @zero_init_large_offset_foo() {
; GFX940-NEXT: s_mov_b32 s3, s0
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: s_add_i32 s3, s32, 0x4004
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4004
; GFX940-NEXT: s_add_i32 s1, s32, 0x4004
; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004
+; GFX940-NEXT: s_add_i32 vcc_lo, s32, 4
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s3
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s2 offset:16
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s1
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, vcc_lo
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
@@ -2343,6 +2920,7 @@ define void @zero_init_large_offset_foo() {
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: s_add_i32 s4, s32, 0x4004
; GFX10-PAL-NEXT: s_mov_b32 s1, s0
; GFX10-PAL-NEXT: s_mov_b32 s2, s0
; GFX10-PAL-NEXT: s_mov_b32 s3, s0
@@ -2350,14 +2928,23 @@ define void @zero_init_large_offset_foo() {
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT: s_add_i32 s3, s32, 4
; GFX10-PAL-NEXT: s_add_i32 s2, s32, 0x4004
; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x4004
; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s2
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s4
+; GFX10-PAL-NEXT: v_mov_b32_e32 v4, s3
+; GFX10-PAL-NEXT: v_mov_b32_e32 v5, s2
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v4
+; GFX10-PAL-NEXT: ;;#ASMEND
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v5
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2368,21 +2955,29 @@ define void @zero_init_large_offset_foo() {
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
-; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT: s_add_i32 s4, s32, 4
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-PAL-NEXT: s_add_i32 s3, s32, 0x4004
; GFX11-PAL-NEXT: s_add_i32 s2, s32, 0x4004
; GFX11-PAL-NEXT: s_add_i32 s1, s32, 0x4004
; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004
; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
+; GFX11-PAL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3
; GFX11-PAL-NEXT: s_clause 0x3
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s2
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s1 offset:16
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32
; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v4
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v5
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2391,6 +2986,8 @@ define void @zero_init_large_offset_foo() {
%pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+ call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
ret void
}
@@ -2413,6 +3010,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-NEXT: s_addk_i32 s0, 0x4004
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_kernel:
@@ -2425,6 +3030,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
@@ -2435,6 +3041,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v1
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_large_offset_kernel:
@@ -2442,7 +3055,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
@@ -2453,6 +3066,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v1
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2478,6 +3098,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_large_offset_kernel:
@@ -2496,6 +3124,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX940-NEXT: s_addk_i32 s0, 0x4004
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2511,6 +3147,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
@@ -2524,6 +3161,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v0
+; GFX1010-PAL-NEXT: ;;#ASMEND
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v1
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2541,6 +3185,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
@@ -2551,6 +3196,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v0
+; GFX1030-PAL-NEXT: ;;#ASMEND
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v1
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2558,7 +3210,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004
; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
@@ -2569,6 +3221,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v1
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2583,6 +3242,8 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -2604,6 +3265,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX9-NEXT: s_addk_i32 s0, 0x4004
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_foo:
@@ -2624,13 +3293,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v1
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_sindex_large_offset_foo:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004
; GFX11-NEXT: s_and_b32 s1, s0, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
@@ -2640,6 +3317,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v1
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
@@ -2664,6 +3348,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_large_offset_foo:
@@ -2680,6 +3372,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX940-NEXT: s_addk_i32 s0, 0x4004
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
@@ -2706,6 +3406,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v0
+; GFX1010-PAL-NEXT: ;;#ASMEND
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v1
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
@@ -2731,13 +3439,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v0
+; GFX1030-PAL-NEXT: ;;#ASMEND
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v1
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004
; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
@@ -2747,6 +3463,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v1
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2761,6 +3484,8 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -2780,6 +3505,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX9-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v1
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
@@ -2798,6 +3531,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v1
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_large_offset_kernel:
@@ -2809,8 +3550,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v1
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -2833,6 +3582,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v1
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vindex_large_offset_kernel:
@@ -2847,6 +3604,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 0x4004
+; GFX940-NEXT: v_mov_b32_e32 v1, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v1
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -2871,6 +3636,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v0
+; GFX1010-PAL-NEXT: ;;#ASMEND
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v1
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -2894,6 +3667,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v0
+; GFX1030-PAL-NEXT: ;;#ASMEND
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v1
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -2905,8 +3686,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v1
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2923,6 +3712,8 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -2932,8 +3723,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004
-; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi
+; GFX9-NEXT: s_add_i32 vcc_lo, s32, 0x4004
+; GFX9-NEXT: v_mov_b32_e32 v1, vcc_lo
; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
@@ -2942,6 +3733,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4
+; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v1
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
@@ -2949,17 +3748,27 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
-; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-NEXT: s_add_i32 s2, s32, 0x4004
+; GFX10-NEXT: s_add_i32 s1, s32, 0x4004
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s1
; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_add_i32 s0, s32, 4
+; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v1
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_large_offset_foo:
@@ -2967,17 +3776,25 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
+; GFX11-NEXT: s_add_i32 s2, s32, 0x4004
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4004
+; GFX11-NEXT: s_add_i32 s0, s32, 4
; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc
+; GFX11-NEXT: scratch_store_b32 v0, v2, s2 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, v1, s1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v1
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -2985,8 +3802,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi
+; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo
; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
@@ -2995,6 +3812,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v1
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_large_offset_foo:
@@ -3004,14 +3829,24 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT: v_mov_b32_e32 v2, 15
-; GFX940-NEXT: s_add_i32 vcc_lo, s32, 0x4004
+; GFX940-NEXT: s_add_i32 s1, s32, 0x4004
; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
-; GFX940-NEXT: scratch_store_dword v1, v2, vcc_lo sc0 sc1
+; GFX940-NEXT: scratch_store_dword v1, v2, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004
-; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1
+; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
+; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_add_i32 vcc_lo, s32, 4
+; GFX940-NEXT: v_mov_b32_e32 v0, vcc_lo
+; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -3019,17 +3854,27 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0
-; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004
-; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-PAL-NEXT: s_add_i32 s2, s32, 0x4004
+; GFX10-PAL-NEXT: s_add_i32 s1, s32, 0x4004
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s1
; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_add_i32 s0, s32, 4
+; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v0
+; GFX10-PAL-NEXT: ;;#ASMEND
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v1
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -3037,34 +3882,26 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004
+; GFX11-PAL-NEXT: s_add_i32 s2, s32, 0x4004
+; GFX11-PAL-NEXT: s_add_i32 s1, s32, 0x4004
+; GFX11-PAL-NEXT: s_add_i32 s0, s32, 4
; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
-; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s0 dlc
+; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s2 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc
+; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s1 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v1
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_vindex_large_offset_foo:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, 15
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT: v_and_b32_e32 v0, v0, v2
-; GCN-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GCN-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GCN-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -3078,6 +3915,8 @@ bb:
%i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+ call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+ call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
ret void
}
@@ -3097,6 +3936,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
@@ -3115,6 +3958,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_large_imm_offset_kernel:
@@ -3127,6 +3974,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3149,6 +4000,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_large_imm_offset_kernel:
@@ -3162,6 +4017,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3186,6 +4045,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT: ;;#ASMSTART
+; GFX1010-PAL-NEXT: ; use v0
+; GFX1010-PAL-NEXT: ;;#ASMEND
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3209,6 +4072,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT: ;;#ASMSTART
+; GFX1030-PAL-NEXT: ; use v0
+; GFX1030-PAL-NEXT: ;;#ASMEND
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3221,6 +4088,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
@@ -3230,6 +4101,7 @@ bb:
store volatile i32 15, i32 addrspace(5)* %i7, align 4
%i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
%i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
+ call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0
ret void
}
@@ -3239,15 +4111,20 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_movk_i32 s0, 0x3000
-; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4
+; GFX9-NEXT: s_add_i32 vcc_lo, s32, 4
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi
+; GFX9-NEXT: s_add_i32 s0, s0, vcc_lo
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4
+; GFX9-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v0
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
@@ -3257,14 +4134,19 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s0, 0x3800
-; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4
-; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo
+; GFX10-NEXT: s_add_i32 s1, s32, 4
+; GFX10-NEXT: s_add_i32 s0, s0, s1
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4
+; GFX10-NEXT: v_mov_b32_e32 v0, vcc_lo
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_large_imm_offset_foo:
@@ -3273,12 +4155,17 @@ define void @store_load_large_imm_offset_foo() {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
; GFX11-NEXT: v_mov_b32_e32 v2, 15
+; GFX11-NEXT: s_add_i32 vcc_lo, s32, 4
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, vcc_lo
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3286,15 +4173,20 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
-; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4
+; GFX9-PAL-NEXT: s_add_i32 vcc_lo, s32, 4
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_lo
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v0
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_large_imm_offset_foo:
@@ -3309,6 +4201,11 @@ define void @store_load_large_imm_offset_foo() {
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_add_i32 vcc_hi, s32, 4
+; GFX940-NEXT: v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3318,14 +4215,19 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
-; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4
-; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo
+; GFX10-PAL-NEXT: s_add_i32 s1, s32, 4
+; GFX10-PAL-NEXT: s_add_i32 s0, s0, s1
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v0
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3334,26 +4236,18 @@ define void @store_load_large_imm_offset_foo() {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
+; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 4
; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, vcc_lo
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_large_imm_offset_foo:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 13
-; GCN-NEXT: scratch_store_dword off, v0, s32 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 0x3000
-; GCN-NEXT: v_mov_b32_e32 v1, 15
-; GCN-NEXT: scratch_store_dword v0, v1, s32 offset:3712 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: scratch_load_dword v0, v0, s32 offset:3712 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
%i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
@@ -3362,6 +4256,7 @@ bb:
store volatile i32 15, i32 addrspace(5)* %i7, align 4
%i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
%i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
+ call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0
ret void
}
@@ -3372,14 +4267,17 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 15
-; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
+; GFX9-NEXT: scratch_store_dword v0, v2, off offset:1024
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ;;#ASMSTART
+; GFX9-NEXT: ; use v1
+; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
@@ -3397,6 +4295,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vidx_sidx_offset:
@@ -3409,6 +4311,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
@@ -3418,17 +4324,20 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
+; GFX9-PAL-NEXT: scratch_store_dword v0, v2, off offset:1024
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: ;;#ASMSTART
+; GFX9-PAL-NEXT: ; use v1
+; GFX9-PAL-NEXT: ;;#ASMEND
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vidx_sidx_offset:
@@ -3441,6 +4350,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, 4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use v0
+; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
@@ -3463,6 +4376,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX10-PAL-NEXT: ;;#ASMSTART
+; GFX10-PAL-NEXT: ; use v0
+; GFX10-PAL-NEXT: ;;#ASMEND
; GFX10-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
@@ -3475,18 +4392,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT: ;;#ASMSTART
+; GFX11-PAL-NEXT: ; use v0
+; GFX11-PAL-NEXT: ;;#ASMEND
; GFX11-PAL-NEXT: s_endpgm
-; GCN-LABEL: store_load_vidx_sidx_offset:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 15
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_lshl_u32 v0, s0, v0, 2
-; GCN-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_endpgm
bb:
%alloca = alloca [32 x i32], align 4, addrspace(5)
%vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -3495,6 +4405,7 @@ bb:
%gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
store volatile i32 15, i32 addrspace(5)* %gep, align 4
%load = load volatile i32, i32 addrspace(5)* %gep, align 4
+ call void asm sideeffect "; use $0", "s"([32 x i32] addrspace(5)* %alloca) #0
ret void
}
@@ -3577,16 +4488,6 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_i64_aligned:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, 15
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, i64 addrspace(5)* %arg, align 8
%load = load volatile i64, i64 addrspace(5)* %arg, align 8
@@ -3672,16 +4573,6 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_i64_unaligned:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, 15
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, i64 addrspace(5)* %arg, align 1
%load = load volatile i64, i64 addrspace(5)* %arg, align 1
@@ -3774,17 +4665,6 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg)
; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_v3i32_unaligned:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, 1
-; GCN-NEXT: v_mov_b32_e32 v3, 2
-; GCN-NEXT: v_mov_b32_e32 v4, 3
-; GCN-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
%load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
@@ -3882,18 +4762,6 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg)
; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_v4i32_unaligned:
-; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, 1
-; GCN-NEXT: v_mov_b32_e32 v3, 2
-; GCN-NEXT: v_mov_b32_e32 v4, 3
-; GCN-NEXT: v_mov_b32_e32 v5, 4
-; GCN-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
%load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 33d1bd6b09a7e..8ceb5ec1ed8ab 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -374,7 +374,6 @@
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Shrink Wrapping analysis
; GCN-O1-NEXT: Prologue/Epilogue Insertion & Frame Finalization
-; GCN-O1-NEXT: Machine Late Instructions Cleanup Pass
; GCN-O1-NEXT: Control Flow Optimizer
; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-NEXT: Tail Duplication
@@ -671,7 +670,6 @@
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Shrink Wrapping analysis
; GCN-O1-OPTS-NEXT: Prologue/Epilogue Insertion & Frame Finalization
-; GCN-O1-OPTS-NEXT: Machine Late Instructions Cleanup Pass
; GCN-O1-OPTS-NEXT: Control Flow Optimizer
; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Tail Duplication
@@ -970,7 +968,6 @@
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Shrink Wrapping analysis
; GCN-O2-NEXT: Prologue/Epilogue Insertion & Frame Finalization
-; GCN-O2-NEXT: Machine Late Instructions Cleanup Pass
; GCN-O2-NEXT: Control Flow Optimizer
; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O2-NEXT: Tail Duplication
@@ -1282,7 +1279,6 @@
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Shrink Wrapping analysis
; GCN-O3-NEXT: Prologue/Epilogue Insertion & Frame Finalization
-; GCN-O3-NEXT: Machine Late Instructions Cleanup Pass
; GCN-O3-NEXT: Control Flow Optimizer
; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O3-NEXT: Tail Duplication
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 463aacd8e28eb..d125f4304c91f 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -188,6 +188,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
; GCN-NEXT: ; %bb.3: ; %LeafBlock1
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_cmp_eq_u32 s8, 1
+; GCN-NEXT: s_mov_b64 s[4:5], -1
; GCN-NEXT: s_cbranch_scc0 .LBB1_5
; GCN-NEXT: ; %bb.4: ; %case1
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index fb070e8304919..50a8d7815b932 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -187,6 +187,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; SI-NEXT: s_branch .LBB3_3
; SI-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1
; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: s_mov_b64 s[12:13], -1
+; SI-NEXT: s_mov_b64 s[14:15], -1
; SI-NEXT: .LBB3_2: ; %Flow
; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1
; SI-NEXT: s_and_b64 vcc, exec, s[14:15]
@@ -204,6 +206,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; SI-NEXT: s_cbranch_vccz .LBB3_1
; SI-NEXT: ; %bb.5: ; %if.end
; SI-NEXT: ; in Loop: Header=BB3_3 Depth=1
+; SI-NEXT: s_mov_b64 s[14:15], -1
; SI-NEXT: s_mov_b64 vcc, s[6:7]
; SI-NEXT: s_cbranch_vccz .LBB3_7
; SI-NEXT: ; %bb.6: ; %if.else
@@ -260,6 +263,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; FLAT-NEXT: s_branch .LBB3_3
; FLAT-NEXT: .LBB3_1: ; in Loop: Header=BB3_3 Depth=1
; FLAT-NEXT: s_mov_b64 s[8:9], 0
+; FLAT-NEXT: s_mov_b64 s[12:13], -1
+; FLAT-NEXT: s_mov_b64 s[14:15], -1
; FLAT-NEXT: .LBB3_2: ; %Flow
; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1
; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15]
@@ -277,6 +282,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; FLAT-NEXT: s_cbranch_vccz .LBB3_1
; FLAT-NEXT: ; %bb.5: ; %if.end
; FLAT-NEXT: ; in Loop: Header=BB3_3 Depth=1
+; FLAT-NEXT: s_mov_b64 s[14:15], -1
; FLAT-NEXT: s_mov_b64 vcc, s[6:7]
; FLAT-NEXT: s_cbranch_vccz .LBB3_7
; FLAT-NEXT: ; %bb.6: ; %if.else
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index d94fed4d12943..9af9894110c07 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -60,6 +60,7 @@ define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n)
; CHECK-NEXT: s_cmp_lg_u32 s10, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_14
; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: s_mov_b64 s[2:3], 0
; CHECK-NEXT: s_mov_b64 s[0:1], -1
; CHECK-NEXT: .LBB0_4: ; %Flow3
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
@@ -102,6 +103,7 @@ define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n)
; CHECK-NEXT: s_branch .LBB0_10
; CHECK-NEXT: .LBB0_14: ; %cond.false.i8
; CHECK-NEXT: s_mov_b64 s[2:3], -1
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: s_trap 2
; CHECK-NEXT: s_branch .LBB0_4
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
index 56d7fc335911e..c3d3993a2736d 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
@@ -140,6 +140,7 @@ define void @my_func(i32 %0) {
; GCN-NEXT: s_cbranch_scc1 .LBB0_10
; GCN-NEXT: ; %bb.9:
; GCN-NEXT: s_mov_b64 s[6:7], -1
+; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
; GCN-NEXT: s_cbranch_execnz .LBB0_3
; GCN-NEXT: s_branch .LBB0_4
@@ -172,6 +173,7 @@ define void @my_func(i32 %0) {
; GCN-NEXT: ; %bb.15: ; %LeafBlock9
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
; GCN-NEXT: s_mov_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc
; GCN-NEXT: ; %bb.16: ; %do.body.i.i.i.i
; GCN-NEXT: s_mov_b64 s[4:5], exec
diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index b9b7a5d0f9a27..34bc7523051ff 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -34,6 +34,7 @@ define amdgpu_kernel void @test_inst_offset_kernel() {
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8
@@ -70,6 +71,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() {
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_mov_b32 s4, 0x40000
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
@@ -88,6 +90,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() {
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8
@@ -234,6 +237,7 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_movk_i32 s8, 0x1004
; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
@@ -316,6 +320,7 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
@@ -362,6 +367,7 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
@@ -385,6 +391,7 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index ab8efa9f21a0b..bdeb97cede4c3 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10551,6 +10551,7 @@ define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ;;#ASMEND
+; GFX6-NEXT: s_mov_b32 s2, 0x84800
; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
@@ -10795,7 +10796,7 @@ define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_nop 0
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload
@@ -11031,6 +11032,7 @@ define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v60
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v65
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v66
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v67
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 411335a98c758..253e17cdd303e 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -149,7 +149,6 @@
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Shrink Wrapping analysis
; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
-; CHECK-NEXT: Machine Late Instructions Cleanup Pass
; CHECK-NEXT: Control Flow Optimizer
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Tail Duplication
diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
index fa94812509534..b5c63af5a348d 100644
--- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -1652,6 +1652,7 @@ define void @infiniteloop3() "frame-pointer"="all" {
; THUMB-ENABLE-NEXT: movs r0, #0
; THUMB-ENABLE-NEXT: cbnz r0, LBB11_5
; THUMB-ENABLE-NEXT: @ %bb.1: @ %loop2a.preheader
+; THUMB-ENABLE-NEXT: movs r0, #0
; THUMB-ENABLE-NEXT: movs r1, #0
; THUMB-ENABLE-NEXT: mov r2, r0
; THUMB-ENABLE-NEXT: b LBB11_3
@@ -1678,6 +1679,7 @@ define void @infiniteloop3() "frame-pointer"="all" {
; THUMB-DISABLE-NEXT: movs r0, #0
; THUMB-DISABLE-NEXT: cbnz r0, LBB11_5
; THUMB-DISABLE-NEXT: @ %bb.1: @ %loop2a.preheader
+; THUMB-DISABLE-NEXT: movs r0, #0
; THUMB-DISABLE-NEXT: movs r1, #0
; THUMB-DISABLE-NEXT: mov r2, r0
; THUMB-DISABLE-NEXT: b LBB11_3
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll
index 18fa1ad2f1323..48241424ac6e2 100644
--- a/llvm/test/CodeGen/ARM/fpclamptosat.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll
@@ -3764,6 +3764,7 @@ define i64 @stest_f32i64_mm(float %x) {
; SOFT-NEXT: @ %bb.18: @ %entry
; SOFT-NEXT: mov r3, r6
; SOFT-NEXT: .LBB48_19: @ %entry
+; SOFT-NEXT: ldr r0, .LCPI48_0
; SOFT-NEXT: cmp r4, r0
; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload
; SOFT-NEXT: beq .LBB48_21
@@ -4346,6 +4347,7 @@ define i64 @stest_f16i64_mm(half %x) {
; SOFT-NEXT: @ %bb.18: @ %entry
; SOFT-NEXT: mov r3, r6
; SOFT-NEXT: .LBB51_19: @ %entry
+; SOFT-NEXT: ldr r0, .LCPI51_0
; SOFT-NEXT: cmp r4, r0
; SOFT-NEXT: ldr r4, [sp, #16] @ 4-byte Reload
; SOFT-NEXT: beq .LBB51_21
diff --git a/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index 6e5db3ffa5c2b..af2009c7a2526 100644
--- a/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/llvm/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -22,7 +22,7 @@ entry:
; for.body -> for.cond.backedge (100%)
; -> cond.false.i (0%)
; CHECK: bb.1.for.body:
-; CHECK: successors: %bb.2(0x80000000), %bb.5(0x00000000)
+; CHECK: successors: %bb.2(0x80000000), %bb.4(0x00000000)
for.body:
br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1
diff --git a/llvm/test/CodeGen/ARM/jump-table-islands.ll b/llvm/test/CodeGen/ARM/jump-table-islands.ll
index c327affc04539..755ca30199ad1 100644
--- a/llvm/test/CodeGen/ARM/jump-table-islands.ll
+++ b/llvm/test/CodeGen/ARM/jump-table-islands.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=armv7-apple-ios8.0 -o - %s | FileCheck %s
-%BigInt = type i8500
+%BigInt = type i5500
define %BigInt @test_moved_jumptable(i1 %tst, i32 %sw, %BigInt %l) {
; CHECK-LABEL: test_moved_jumptable:
diff --git a/llvm/test/CodeGen/ARM/reg_sequence.ll b/llvm/test/CodeGen/ARM/reg_sequence.ll
index db620f65855cf..976dddc694d8f 100644
--- a/llvm/test/CodeGen/ARM/reg_sequence.ll
+++ b/llvm/test/CodeGen/ARM/reg_sequence.ll
@@ -283,6 +283,7 @@ define arm_aapcs_vfpcc i32 @t10(float %x) nounwind {
; CHECK-NEXT: vst1.32 {d17[1]}, [r0:32]
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: movne r0, #0
; CHECK-NEXT: bxne lr
; CHECK-NEXT: LBB9_1:
; CHECK-NEXT: trap
diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll
index a1f8fdb28b203..a87c517705232 100644
--- a/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll
+++ b/llvm/test/CodeGen/BPF/objdump_cond_op_2.ll
@@ -14,8 +14,9 @@ define i32 @test(i32, i32) local_unnamed_addr #0 {
; <label>:4: ; preds = %2
br label %5
-; CHECK: if r4 s>= r3 goto +10 <LBB0_2>
-; CHECK-LABEL: <LBB0_1>:
+; CHECK: if r4 s>= r3 goto +11 <LBB0_3>
+; CHECK: r0 = 0
+; CHECK-LABEL: <LBB0_2>:
; <label>:5: ; preds = %4, %5
%6 = phi i32 [ %9, %5 ], [ 0, %4 ]
@@ -27,12 +28,12 @@ define i32 @test(i32, i32) local_unnamed_addr #0 {
%12 = icmp slt i32 %10, %11
br i1 %12, label %5, label %13
; CHECK: r1 = r3
-; CHECK: if r2 s> r3 goto -10 <LBB0_1>
+; CHECK: if r2 s> r3 goto -10 <LBB0_2>
; <label>:13: ; preds = %5, %2
%14 = phi i32 [ 0, %2 ], [ %9, %5 ]
ret i32 %14
-; CHECK-LABEL: <LBB0_2>:
+; CHECK-LABEL: <LBB0_3>:
; CHECK: exit
}
attributes #0 = { norecurse nounwind readnone }
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index 9acdf25857117..ed2bfc9fcf600 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -841,6 +841,7 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: or16 $6, $4
; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload
; MMR3-NEXT: movn $1, $7, $4
+; MMR3-NEXT: li16 $7, 0
; MMR3-NEXT: movn $1, $6, $10
; MMR3-NEXT: lw $4, 24($sp) # 4-byte Folded Reload
; MMR3-NEXT: movz $1, $4, $16
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index b47a1f8f1a4e9..a8d829bef1d49 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -915,6 +915,7 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: or16 $5, $3
; MMR3-NEXT: lw $3, 12($sp) # 4-byte Folded Reload
; MMR3-NEXT: movn $8, $7, $3
+; MMR3-NEXT: li16 $7, 0
; MMR3-NEXT: movn $8, $5, $10
; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload
; MMR3-NEXT: movz $8, $3, $16
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index 8c51f75504506..6b87f605dd8b7 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -182,7 +182,6 @@
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Shrink Wrapping analysis
; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
-; CHECK-NEXT: Machine Late Instructions Cleanup Pass
; CHECK-NEXT: Control Flow Optimizer
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Tail Duplication
diff --git a/llvm/test/CodeGen/PowerPC/cgp-select.ll b/llvm/test/CodeGen/PowerPC/cgp-select.ll
index 2c36233b2b714..abc868c14cf85 100644
--- a/llvm/test/CodeGen/PowerPC/cgp-select.ll
+++ b/llvm/test/CodeGen/PowerPC/cgp-select.ll
@@ -11,6 +11,7 @@ define dso_local void @wibble(ptr nocapture readonly %arg, i32 signext %arg1, pt
; CHECK-NEXT: blt 0, .LBB0_5
; CHECK-NEXT: # %bb.1: # %bb6
; CHECK-NEXT: clrldi 4, 4, 32
+; CHECK-NEXT: li 7, 7
; CHECK-NEXT: addi 4, 4, -1
; CHECK-NEXT: mtctr 4
; CHECK-NEXT: li 4, 8
diff --git a/llvm/test/CodeGen/PowerPC/fast-isel-branch.ll b/llvm/test/CodeGen/PowerPC/fast-isel-branch.ll
index 4a424b673afc7..826ceead17118 100644
--- a/llvm/test/CodeGen/PowerPC/fast-isel-branch.ll
+++ b/llvm/test/CodeGen/PowerPC/fast-isel-branch.ll
@@ -53,7 +53,7 @@ define signext i32 @bar() #0 {
; AIX64-NEXT: L..BB0_1: # %for.cond
; AIX64-NEXT: #
; AIX64-NEXT: lwz 3, 120(1)
-; AIX64-NEXT: ld 4, L..C0(2)
+; AIX64-NEXT: ld 4, L..C0(2) # @x
; AIX64-NEXT: lwz 4, 0(4)
; AIX64-NEXT: cmpw 3, 4
; AIX64-NEXT: bge 0, L..BB0_4
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
index 8390227508268..97f9ae172e769 100644
--- a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
@@ -618,6 +618,7 @@ define zeroext i32 @ppcq_to_u32(ppc_fp128 %m) #0 {
; P8-NEXT: lfs f0, .LCPI13_0@toc@l(r3)
; P8-NEXT: lis r3, -32768
; P8-NEXT: fcmpo cr0, f2, f3
+; P8-NEXT: xxlxor f3, f3, f3
; P8-NEXT: fcmpo cr1, f1, f0
; P8-NEXT: crand 4*cr5+lt, 4*cr1+eq, lt
; P8-NEXT: crandc 4*cr5+gt, 4*cr1+lt, 4*cr1+eq
@@ -659,6 +660,7 @@ define zeroext i32 @ppcq_to_u32(ppc_fp128 %m) #0 {
; P9-NEXT: lfs f0, .LCPI13_0@toc@l(r3)
; P9-NEXT: fcmpo cr1, f2, f3
; P9-NEXT: lis r3, -32768
+; P9-NEXT: xxlxor f3, f3, f3
; P9-NEXT: fcmpo cr0, f1, f0
; P9-NEXT: crand 4*cr5+lt, eq, 4*cr1+lt
; P9-NEXT: crandc 4*cr5+gt, lt, eq
diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
index 9bc88c93ccb0a..bfb4f05c231b4 100644
--- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
@@ -1295,6 +1295,7 @@ define i32 @test_fptoui_ppc_i32_ppc_fp128(ppc_fp128 %first) #0 {
; PC64LE-NEXT: lfs 0, .LCPI31_0@toc@l(3)
; PC64LE-NEXT: lis 3, -32768
; PC64LE-NEXT: fcmpo 0, 2, 3
+; PC64LE-NEXT: xxlxor 3, 3, 3
; PC64LE-NEXT: fcmpo 1, 1, 0
; PC64LE-NEXT: crand 20, 6, 0
; PC64LE-NEXT: crandc 21, 4, 6
@@ -1332,6 +1333,7 @@ define i32 @test_fptoui_ppc_i32_ppc_fp128(ppc_fp128 %first) #0 {
; PC64LE9-NEXT: lfs 0, .LCPI31_0@toc@l(3)
; PC64LE9-NEXT: fcmpo 1, 2, 3
; PC64LE9-NEXT: lis 3, -32768
+; PC64LE9-NEXT: xxlxor 3, 3, 3
; PC64LE9-NEXT: fcmpo 0, 1, 0
; PC64LE9-NEXT: crand 20, 2, 4
; PC64LE9-NEXT: crandc 21, 0, 2
diff --git a/llvm/test/CodeGen/SystemZ/frame-28.mir b/llvm/test/CodeGen/SystemZ/frame-28.mir
deleted file mode 100644
index dd5933a9c7b4b..0000000000000
--- a/llvm/test/CodeGen/SystemZ/frame-28.mir
+++ /dev/null
@@ -1,327 +0,0 @@
-# RUN: llc -mtriple=s390x-linux-gnu -start-before=prologepilog %s -o - -mcpu=z14 \
-# RUN: -verify-machineinstrs 2>&1 | FileCheck %s
-# REQUIRES: asserts
-#
-# Test that redundant frame addressing anchor points are removed by
-# MachineLateInstrsCleanup.
-
---- |
- define void @fun1() { ret void }
- define void @fun2() { ret void }
- define void @fun3() { ret void }
- define void @fun4() { ret void }
- define void @fun5() { ret void }
- define void @fun6() { ret void }
- define void @fun7() { ret void }
- define void @fun8() { ret void }
-
- declare i32 @foo()
-
- @ptr = external dso_local local_unnamed_addr global ptr
----
-
-# Test elimination of redundant LAYs in successor blocks.
-# CHECK-LABEL: fun1:
-# CHECK: lay %r1, 4096(%r15)
-# CHECK: # %bb.1:
-# CHECK-NOT: lay
-# CHECK: .LBB0_2:
-# CHECK-NOT: lay
----
-name: fun1
-tracksRegLiveness: true
-stack:
- - { id: 0, size: 5000 }
- - { id: 1, size: 2500 }
- - { id: 2, size: 2500 }
-
-machineFunctionInfo: {}
-body: |
- bb.0 (%ir-block.0):
- liveins: $f16d
- successors: %bb.2(0x00000001), %bb.1(0x7fffffff)
-
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.1, 0, $noreg
- CHIMux undef $r0l, 3, implicit-def $cc
- BRC 14, 8, %bb.2, implicit killed $cc
- J %bb.1
-
- bb.1:
- liveins: $f16d
- VST64 renamable $f16d, %stack.2, 0, $noreg
- J %bb.2
-
- bb.2:
- liveins: $f16d
- VST64 renamable $f16d, %stack.1, 0, $noreg
- Return
-...
-
-# In this function the LAY in bb.1 will have a different offset, so the first
-# LAY in bb.2 must remain.
-# CHECK-LABEL: fun2:
-# CHECK: lay %r1, 4096(%r15)
-# CHECK: # %bb.1:
-# CHECK: lay %r1, 8192(%r15)
-# CHECK: .LBB1_2:
-# CHECK: lay %r1, 4096(%r15)
-# CHECK-NOT: lay
----
-name: fun2
-tracksRegLiveness: true
-stack:
- - { id: 0, size: 5000 }
- - { id: 1, size: 5000 }
- - { id: 2, size: 2500 }
-
-machineFunctionInfo: {}
-body: |
- bb.0 (%ir-block.0):
- liveins: $f16d
- successors: %bb.2(0x00000001), %bb.1(0x7fffffff)
-
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.1, 0, $noreg
- CHIMux undef $r0l, 3, implicit-def $cc
- BRC 14, 8, %bb.2, implicit killed $cc
- J %bb.1
-
- bb.1:
- liveins: $f16d
- VST64 renamable $f16d, %stack.2, 0, $noreg
- J %bb.2
-
- bb.2:
- liveins: $f16d
- VST64 renamable $f16d, %stack.1, 0, $noreg
- VST64 renamable $f16d, %stack.1, 0, $noreg
- Return
-...
-
-# Test case with a loop (with room for improvement: since %r1 is not clobbered
-# inside the loop only the first LAY is needed).
-# CHECK-LABEL: fun3:
-# CHECK: lay %r1, 4096(%r15)
-# CHECK: .LBB2_1:
-# CHECK: lay %r1, 4096(%r15)
-# CHECK: .LBB2_2:
-# CHECK-NOT: lay %r1, 4096(%r15)
----
-name: fun3
-tracksRegLiveness: true
-stack:
- - { id: 0, size: 5000 }
- - { id: 1, size: 2500 }
- - { id: 2, size: 2500 }
-
-machineFunctionInfo: {}
-body: |
- bb.0 (%ir-block.0):
- liveins: $f16d
- successors: %bb.2(0x00000001), %bb.1(0x7fffffff)
-
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.1, 0, $noreg
- CHIMux undef $r0l, 3, implicit-def $cc
- BRC 14, 8, %bb.2, implicit killed $cc
- J %bb.1
-
- bb.1:
- liveins: $f16d
- successors: %bb.2(0x00000001), %bb.1(0x7fffffff)
-
- VST64 renamable $f16d, %stack.2, 0, $noreg
- CHIMux undef $r0l, 3, implicit-def $cc
- BRC 14, 8, %bb.1, implicit killed $cc
- J %bb.2
-
- bb.2:
- liveins: $f16d
- VST64 renamable $f16d, %stack.1, 0, $noreg
- Return
-...
-
-# Test case with a call which clobbers r1: the second LAY after the call is needed.
-# CHECK-LABEL: fun4:
-# CHECK: lay %r1, 4096(%r15)
-# CHECK: brasl
-# CHECK: lay %r1, 4096(%r15)
----
-name: fun4
-tracksRegLiveness: true
-stack:
- - { id: 0, size: 5000 }
- - { id: 1, size: 2500 }
-
-machineFunctionInfo: {}
-body: |
- bb.0 (%ir-block.0):
- liveins: $f16d
-
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.1, 0, $noreg
- ADJCALLSTACKDOWN 0, 0
- CallBRASL @foo, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def $r2l
- ADJCALLSTACKUP 0, 0
- $f17d = IMPLICIT_DEF
- VST64 renamable $f17d, %stack.1, 0, $noreg
- Return
-...
-
-# Test case where index reg is loaded instead of using an LAY. Only one LGHI is needed.
-# CHECK-LABEL: fun5:
-# CHECK: lghi %r1, 4096
-# CHECK-NOT: lghi
----
-name: fun5
-tracksRegLiveness: true
-stack:
- - { id: 0, size: 5000 }
- - { id: 1, size: 2500 }
-
-machineFunctionInfo: {}
-body: |
- bb.0 (%ir-block.0):
- liveins: $f16d
-
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- VST64 renamable $f16d, %stack.0, 0, $noreg
- $f0q = nofpexcept LXEB %stack.1, 0, $noreg, implicit $fpc
- $f1q = nofpexcept LXEB %stack.1, 0, $noreg, implicit $fpc
- Return
-...
-
-# Test where the constant is a Global. Only one LARL is needed.
-# CHECK-LABEL: fun6:
-# CHECK: larl %r1, ptr
-# CHECK-NOT: larl
----
-name: fun6
-alignment: 16
-tracksRegLiveness: true
-tracksDebugUserValues: true
-frameInfo:
- maxAlignment: 1
- maxCallFrameSize: 0
-fixedStack:
- - { id: 0, offset: -160, size: 8, alignment: 8 }
-machineFunctionInfo: {}
-body: |
- bb.0:
- successors: %bb.2(0x30000000), %bb.1(0x50000000)
-
- renamable $r1d = LARL @ptr
- CGHSI killed renamable $r1d, 0, 0, implicit-def $cc :: (volatile dereferenceable load (s64) from @ptr)
- BRC 14, 8, %bb.2, implicit killed $cc
- J %bb.1
-
- bb.1:
- renamable $r1d = LARL @ptr
- MVGHI killed renamable $r1d, 0, 0
-
- bb.2:
- Return
-
-...
-
-# Load of an invariant location (GOT). Only one LGRL is needed.
-# CHECK-LABEL: fun7:
-# CHECK: lgrl %r1, ptr
-# CHECK-NOT: lgrl
----
-name: fun7
-alignment: 16
-tracksRegLiveness: true
-tracksDebugUserValues: true
-frameInfo:
- maxAlignment: 1
- maxCallFrameSize: 0
-fixedStack:
- - { id: 0, offset: -160, size: 8, alignment: 8 }
-machineFunctionInfo: {}
-body: |
- bb.0:
- successors: %bb.2(0x30000000), %bb.1(0x50000000)
-
- renamable $r1d = LGRL @ptr :: (load (s64) from got)
- CGHSI killed renamable $r1d, 0, 0, implicit-def $cc :: (volatile dereferenceable load (s64) from @ptr)
- BRC 14, 8, %bb.2, implicit killed $cc
- J %bb.1
-
- bb.1:
- renamable $r1d = LGRL @ptr :: (load (s64) from got)
- MVGHI killed renamable $r1d, 0, 0
-
- bb.2:
- Return
-
-...
-
-# Load from constant pool. Only one LARL is needed.
-# CHECK-LABEL: fun8:
-# CHECK: larl %r1, .LCPI7_0
-# CHECK-NOT: larl
----
-name: fun8
-alignment: 16
-tracksRegLiveness: true
-tracksDebugUserValues: true
-liveins:
- - { reg: '$f0s' }
-frameInfo:
- maxAlignment: 1
- maxCallFrameSize: 0
-fixedStack:
- - { id: 0, offset: -160, size: 8, alignment: 8 }
-constants:
- - id: 0
- value: float 0x43E0000000000000
- alignment: 4
-machineFunctionInfo: {}
-body: |
- bb.0 (%ir-block.0):
- successors: %bb.1, %bb.2
- liveins: $f0s
-
- renamable $r1d = LARL %const.0
- renamable $f1s = LE killed renamable $r1d, 0, $noreg :: (load (s32) from constant-pool)
- nofpexcept CEBR renamable $f0s, renamable $f1s, implicit-def $cc, implicit $fpc
- BRC 15, 11, %bb.2, implicit killed $cc
-
- bb.1:
- liveins: $f0s
-
- J %bb.3
-
- bb.2 (%ir-block.0):
- liveins: $f0s, $f1s
-
- renamable $r1d = LARL %const.0
- renamable $f1s = LE killed renamable $r1d, 0, $noreg :: (load (s32) from constant-pool)
-
- bb.3 (%ir-block.0):
- liveins: $r2d
-
- Return
-
-...
diff --git a/llvm/test/CodeGen/Thumb/frame-access.ll b/llvm/test/CodeGen/Thumb/frame-access.ll
index e7d4aca9926d8..6a98d121464be 100644
--- a/llvm/test/CodeGen/Thumb/frame-access.ll
+++ b/llvm/test/CodeGen/Thumb/frame-access.ll
@@ -77,9 +77,12 @@ entry:
; CHECK-FP-ATPCS: adds r0, #8
; CHECK-FP-ATPCS: stm r0!, {r1, r2, r3}
; CHECK-FP-AAPCS: mov r0, r11
-; CHECK-FP-AAPCS: mov r7, r0
-; CHECK-FP-AAPCS: adds r7, #8
-; CHECK-FP-AAPCS: stm r7!, {r1, r2, r3}
+; CHECK-FP-AAPCS: str r1, [r0, #8]
+; CHECK-FP-AAPCS: mov r0, r11
+; CHECK-FP-AAPCS: str r2, [r0, #12]
+; CHECK-FP-AAPCS: mov r0, r11
+; CHECK-FP-AAPCS: str r3, [r0, #16]
+
; Re-aligned stack, access via FP
; int test_args_realign(int a, int b, int c, int d, int e) {
; __attribute__((aligned(16))) int v[4];
@@ -145,9 +148,11 @@ entry:
; CHECK-ATPCS-NEXT: adds r0, #8
; CHECK-ATPCS-NEXT: stm r0!, {r1, r2, r3}
; CHECK-AAPCS: mov r0, r11
-; CHECK-AAPCS: mov r7, r0
-; CHECK-AAPCS: adds r7, #8
-; CHECK-AAPCS: stm r7!, {r1, r2, r3}
+; CHECK-AAPCS: str r1, [r0, #8]
+; CHECK-AAPCS: mov r0, r11
+; CHECK-AAPCS: str r2, [r0, #12]
+; CHECK-AAPCS: mov r0, r11
+; CHECK-AAPCS: str r3, [r0, #16]
; VLAs present, access via FP
; int test_args_vla(int a, int b, int c, int d, int e) {
; int v[a];
@@ -303,9 +308,11 @@ entry:
; CHECK-FP-ATPCS-NEXT: adds r0, #8
; CHECK-FP-ATPCS-NEXT: stm r0!, {r1, r2, r3}
; CHECK-FP-AAPCS: mov r0, r11
-; CHECK-FP-AAPCS-NEXT: mov r5, r0
-; CHECK-FP-AAPCS-NEXT: adds r5, #8
-; CHECK-FP-AAPCS-NEXT: stm r5!, {r1, r2, r3}
+; CHECK-FP-AAPCS-NEXT: str r1, [r0, #8]
+; CHECK-FP-AAPCS-NEXT: mov r0, r11
+; CHECK-FP-AAPCS-NEXT: str r2, [r0, #12]
+; CHECK-FP-AAPCS-NEXT: mov r0, r11
+; CHECK-FP-AAPCS-NEXT: str r3, [r0, #16]
; struct S { int x[128]; } s;
; int test(S a, int b) {
diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
index d3297d2a18089..7929bba0638a5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
@@ -1890,6 +1890,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
; CHECK-NEXT: cmp.w r2, #-1
; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: csel r5, r5, r8, gt
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: csel r5, r6, r5, eq
@@ -2151,6 +2152,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
; CHECK-NEXT: cmp.w r2, #-1
; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: csel r5, r5, r8, gt
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: csel r5, r6, r5, eq
@@ -2408,6 +2410,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
; CHECK-NEXT: cmp.w r2, #-1
; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: csel r5, r5, r8, gt
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: csel r5, r6, r5, eq
diff --git a/llvm/test/CodeGen/X86/2008-04-09-BranchFolding.ll b/llvm/test/CodeGen/X86/2008-04-09-BranchFolding.ll
index 45b5ce562d805..878af85c51c1a 100644
--- a/llvm/test/CodeGen/X86/2008-04-09-BranchFolding.ll
+++ b/llvm/test/CodeGen/X86/2008-04-09-BranchFolding.ll
@@ -18,6 +18,7 @@ define fastcc ptr @pushdecl(ptr %x) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB0_1: # %bb160
+; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll b/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
index b32afdc2214e0..c5e1ff6689220 100644
--- a/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
@@ -27,6 +27,7 @@ define i16 @SQLDriversW(ptr %henv, i16 zeroext %fDir, ptr %szDrvDesc, i16 signe
; CHECK-NEXT: jne LBB0_6
; CHECK-NEXT: ## %bb.4: ## %bb37
; CHECK-NEXT: movw $0, 40(%edi)
+; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: leal (,%ecx,4), %ecx
; CHECK-NEXT: leal (,%ebx,4), %edx
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
index ae0be9b5a5bcd..c67208dcf44fa 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -58,6 +58,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 1088(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
+; CHECK-NEXT: movabsq $64, %rcx
; CHECK-NEXT: tileloadd 64(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rax,%r14)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
index 9f0d8aee3c4ee..e4a2279f4675f 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -46,6 +46,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm5, 1088(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm5
+; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm5, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
@@ -63,6 +64,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm5, 1088(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tdpbssd %tmm3, %tmm2, %tmm5
+; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm5, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/fast-isel-stackcheck.ll b/llvm/test/CodeGen/X86/fast-isel-stackcheck.ll
index 10e192e385018..1398b3006699d 100644
--- a/llvm/test/CodeGen/X86/fast-isel-stackcheck.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-stackcheck.ll
@@ -8,6 +8,7 @@ target triple = "x86_64-apple-macosx"
; CHECK-LABEL: foo:
; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
+; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
define void @foo() #0 {
entry:
%_tags = alloca [3 x i32], align 4
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index bf561a19b6874..eaf794c787688 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -338,24 +338,26 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
; X86-SLOW-NEXT: movl %edx, %ebx
; X86-SLOW-NEXT: movl %esi, %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: testb $32, %al
-; X86-SLOW-NEXT: je .LBB6_5
-; X86-SLOW-NEXT: .LBB6_4:
-; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl %ebp, %esi
-; X86-SLOW-NEXT: movl %edx, %ebp
-; X86-SLOW-NEXT: movl %ecx, %edx
-; X86-SLOW-NEXT: jmp .LBB6_6
+; X86-SLOW-NEXT: jmp .LBB6_3
; X86-SLOW-NEXT: .LBB6_1:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT: .LBB6_3:
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: testb $32, %al
; X86-SLOW-NEXT: jne .LBB6_4
-; X86-SLOW-NEXT: .LBB6_5:
+; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: movl %ecx, %ebx
; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: jmp .LBB6_6
+; X86-SLOW-NEXT: .LBB6_4:
+; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl %ebp, %esi
+; X86-SLOW-NEXT: movl %edx, %ebp
+; X86-SLOW-NEXT: movl %ecx, %edx
; X86-SLOW-NEXT: .LBB6_6:
; X86-SLOW-NEXT: movl %edx, %edi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: shrl %ebx
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index e226279164a01..7bdc34b224cda 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -247,6 +247,7 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
; SSE-NEXT: retq
; SSE-NEXT: LBB3_1: ## %cond.load
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB3_4
; SSE-NEXT: LBB3_3: ## %cond.load1
@@ -1128,6 +1129,7 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE2-NEXT: retq
; SSE2-NEXT: LBB10_1: ## %cond.load
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB10_4
; SSE2-NEXT: LBB10_3: ## %cond.load1
@@ -1207,6 +1209,7 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE42-NEXT: retq
; SSE42-NEXT: LBB10_1: ## %cond.load
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE42-NEXT: xorps %xmm1, %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB10_4
; SSE42-NEXT: LBB10_3: ## %cond.load1
@@ -2647,6 +2650,7 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE2-NEXT: retq
; SSE2-NEXT: LBB20_1: ## %cond.load
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB20_4
; SSE2-NEXT: LBB20_3: ## %cond.load1
@@ -2726,6 +2730,7 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
; SSE42-NEXT: retq
; SSE42-NEXT: LBB20_1: ## %cond.load
; SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB20_4
; SSE42-NEXT: LBB20_3: ## %cond.load1
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index f8e9bf3086cab..c28f30f2a834b 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2231,6 +2231,7 @@ define <16 x i32> @splat_v3i32(ptr %ptr) {
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1]
+; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: xorps %xmm3, %xmm3
; SSE42-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 24f544512b8b1..9de77d2f4c3c8 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -170,7 +170,6 @@
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Shrink Wrapping analysis
; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
-; CHECK-NEXT: Machine Late Instructions Cleanup Pass
; CHECK-NEXT: Control Flow Optimizer
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Tail Duplication
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 6bec36638720d..78e47057546b3 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -1240,6 +1240,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: cmovsl %esi, %eax
+; X86-NEXT: movl $0, %esi
; X86-NEXT: movl $-1, %ebx
; X86-NEXT: cmovsl %ebx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 10064772a8567..8fb34ff811875 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -533,6 +533,7 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: .LBB6_9: # %entry
; i686-NEXT: movl %edi, %esi
; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: shrl %cl, %ebp
; i686-NEXT: testb $32, %cl
@@ -845,6 +846,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
; i686-NEXT: movb $64, %cl
; i686-NEXT: subb %dl, %cl
+; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
; i686-NEXT: shldl %cl, %ebx, %ebp
; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index a73fa32e21013..5246c8b14b80c 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -354,6 +354,7 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-NEXT: cmpw %si, %dx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: cmovnel %eax, %ebx
+; X86-NEXT: movl $65535, %eax # imm = 0xFFFF
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: shll %cl, %esi
diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll
index e753019593d80..dd0a50e6ba984 100644
--- a/llvm/test/CodeGen/X86/vec_extract.ll
+++ b/llvm/test/CodeGen/X86/vec_extract.ll
@@ -111,12 +111,14 @@ define <4 x i32> @ossfuzz15662(ptr %in) {
; X32: # %bb.0:
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: ossfuzz15662:
; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, (%rax)
+; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
%C10 = icmp ule i1 false, false
%C3 = icmp ule i1 true, undef
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index b7cfdeb7aa5a3..ab16e1a60946c 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -178,12 +178,14 @@ define <4 x i32> @test17(<4 x i32> %a0, ptr %dummy) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: movaps %xmm0, (%eax)
+; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test17:
; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
%a = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 6)
store <4 x i32> %a, ptr %dummy
@@ -197,12 +199,14 @@ define <4 x i32> @test18(<4 x i32> %a0, ptr %dummy) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: movaps %xmm0, (%eax)
+; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test18:
; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
%a = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 3)
store <4 x i32> %a, ptr %dummy
diff --git a/llvm/test/CodeGen/XCore/scavenging.ll b/llvm/test/CodeGen/XCore/scavenging.ll
index dc126984def7c..adf313e4a9890 100644
--- a/llvm/test/CodeGen/XCore/scavenging.ll
+++ b/llvm/test/CodeGen/XCore/scavenging.ll
@@ -87,10 +87,13 @@ declare void @g(i32*, i32*)
; CHECK: ldaw r0, sp[0]
; CHECK: ldw r5, cp[[[INDEX1]]]
; CHECK: stw r1, r0[r5]
+; CHECK: ldaw r0, sp[0]
; CHECK: ldw r1, cp[[[INDEX2]]]
; CHECK: stw r2, r0[r1]
+; CHECK: ldaw r0, sp[0]
; CHECK: ldw r1, cp[[[INDEX3]]]
; CHECK: stw r3, r0[r1]
+; CHECK: ldaw r0, sp[0]
; CHECK: ldw r1, cp[[[INDEX4]]]
; CHECK: stw r11, r0[r1]
; CHECK: ldaw sp, sp[65535]