[llvm] c649fd3 - [MachineSink][AArch64] Sink instruction copies when they can replace a copy into a hard register or be folded into an addressing mode
Momchil Velikov via llvm-commits
llvm-commits@lists.llvm.org
Mon Sep 25 02:50:11 PDT 2023
Author: Momchil Velikov
Date: 2023-09-25T10:49:44+01:00
New Revision: c649fd34e928ad01951cbff298c5c44853dd41dd
URL: https://github.com/llvm/llvm-project/commit/c649fd34e928ad01951cbff298c5c44853dd41dd
DIFF: https://github.com/llvm/llvm-project/commit/c649fd34e928ad01951cbff298c5c44853dd41dd.diff
LOG: [MachineSink][AArch64] Sink instruction copies when they can replace a copy into a hard register or be folded into an addressing mode
This patch adds a new code transformation to the `MachineSink` pass
that tries to sink copies of an instruction when the copies can be folded
into the addressing modes of load/store instructions, or can
replace another instruction (currently, copies into a hard register);
a small illustrative example follows the criteria below.
The criteria for performing the transformation are:
* the register pressure at the sink destination block must not
exceed the register pressure limits
* the latency and throughput of the load/store or the copy must not deteriorate
* the original instruction must be deleted
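As a sketch of the kind of code this targets (a hypothetical example, not
taken from the patch; the test llvm/test/CodeGen/AArch64/sink-and-fold.ll
contains the authoritative cases): the address computation for `q` below has
a single vreg definition whose only uses are memory accesses, so copies of it
can be folded into the addressing modes and the original add deleted, rather
than keeping a register live across the branch just to hold the address.

  // Hypothetical C++ source, for illustration only.
  int touch(int *p, long i, bool c) {
    int *q = p + i; // e.g. materialised once as: add x8, x0, x1, lsl #2
    if (c)
      return *q;    // can instead use: ldr w0, [x0, x1, lsl #2]
    *q = 0;         // can instead use: str wzr, [x0, x1, lsl #2]
    return 0;
  }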
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D152828
Added:
Modified:
llvm/include/llvm/CodeGen/TargetInstrInfo.h
llvm/include/llvm/CodeGen/TargetPassConfig.h
llvm/lib/CodeGen/ImplicitNullChecks.cpp
llvm/lib/CodeGen/MachineSink.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
llvm/lib/Target/AArch64/AArch64InstrInfo.h
llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
llvm/test/CodeGen/AArch64/align-down.ll
llvm/test/CodeGen/AArch64/and-mask-removal.ll
llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
llvm/test/CodeGen/AArch64/arm64-long-shift.ll
llvm/test/CodeGen/AArch64/arm64-stp.ll
llvm/test/CodeGen/AArch64/arm64_32-addrs.ll
llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
llvm/test/CodeGen/AArch64/atomic-ops.ll
llvm/test/CodeGen/AArch64/cmp-select-sign.ll
llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
llvm/test/CodeGen/AArch64/loop-sink.mir
llvm/test/CodeGen/AArch64/nontemporal-load.ll
llvm/test/CodeGen/AArch64/optimize-imm.ll
llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
llvm/test/CodeGen/AArch64/rand.ll
llvm/test/CodeGen/AArch64/shrink-constant-multiple-users.ll
llvm/test/CodeGen/AArch64/sink-and-fold.ll
llvm/test/CodeGen/AArch64/swift-async-win.ll
llvm/test/CodeGen/AArch64/swift-async.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 04859a50d6fdeb4..98679b4dcf3cbfb 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -85,11 +85,21 @@ struct RegImmPair {
/// Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
/// It holds the register values, the scale value and the displacement.
+/// It also holds a descriptor for the expression used to calculate the address
+/// from the operands.
struct ExtAddrMode {
+ enum class Formula {
+ Basic = 0, // BaseReg + ScaledReg * Scale + Displacement
+ SExtScaledReg = 1, // BaseReg + sext(ScaledReg) * Scale + Displacement
+ ZExtScaledReg = 2 // BaseReg + zext(ScaledReg) * Scale + Displacement
+ };
+
Register BaseReg;
Register ScaledReg;
- int64_t Scale;
- int64_t Displacement;
+ int64_t Scale = 0;
+ int64_t Displacement = 0;
+ Formula Form = Formula::Basic;
+ ExtAddrMode() = default;
};
//---------------------------------------------------------------------------
@@ -1436,6 +1446,26 @@ class TargetInstrInfo : public MCInstrInfo {
return std::nullopt;
}
+ /// Check if it's possible and beneficial to fold the addressing computation
+ /// `AddrI` into the addressing mode of the load/store instruction `MemI`. The
+ /// memory instruction is a user of the virtual register `Reg`, which in turn
+ /// is the ultimate destination of zero or more COPY instructions from the
+ /// output register of `AddrI`.
+/// Return the addressing mode after folding in `AM`.
+ virtual bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const {
+ return false;
+ }
+
+ /// Emit a load/store instruction with the same value register as `MemI`, but
+ /// using the address from `AM`. The addressing mode must have been obtained
+/// from `canFoldIntoAddrMode` for the same memory instruction.
+ virtual MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const {
+ llvm_unreachable("target did not implement emitLdStWithAddr()");
+ }
+
/// Returns true if MI's Def is NullValueReg, and the MI
/// does not change the Zero value. i.e. cases such as rax = shr rax, X where
/// NullValueReg = rax. Note that if the NullValueReg is non-zero, this
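The Formula field added to ExtAddrMode above only changes how ScaledReg is
interpreted before scaling. A conceptual sketch (illustration only, not part
of the patch; assumes a caller-provided RegValue callback that yields a
register's 64-bit contents):

  #include "llvm/ADT/STLFunctionalExtras.h" // function_ref
  #include "llvm/CodeGen/TargetInstrInfo.h" // ExtAddrMode, Register
  #include "llvm/Support/MathExtras.h"      // SignExtend64
  using namespace llvm;

  // Conceptual address denoted by an ExtAddrMode; not part of the patch.
  int64_t evalExtAddrMode(const ExtAddrMode &AM,
                          function_ref<int64_t(Register)> RegValue) {
    int64_t Index = RegValue(AM.ScaledReg);
    switch (AM.Form) {
    case ExtAddrMode::Formula::Basic:
      break;                                // use the index register as-is
    case ExtAddrMode::Formula::SExtScaledReg:
      Index = SignExtend64<32>(Index);      // sign-extend the 32-bit index
      break;
    case ExtAddrMode::Formula::ZExtScaledReg:
      Index = static_cast<uint32_t>(Index); // zero-extend the 32-bit index
      break;
    }
    return RegValue(AM.BaseReg) + Index * AM.Scale + AM.Displacement;
  }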
diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index dac327899e33f61..66365419aa330be 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -130,6 +130,11 @@ class TargetPassConfig : public ImmutablePass {
/// Default setting for -enable-tail-merge on this target.
bool EnableTailMerge = true;
+ /// Enable sinking of instructions in MachineSink where a computation can be
+ /// folded into the addressing mode of a memory load/store instruction or
+ /// replace a copy.
+ bool EnableSinkAndFold = false;
+
/// Require processing of functions such that callees are generated before
/// callers.
bool RequireCodeGenSCCOrder = false;
@@ -176,6 +181,9 @@ class TargetPassConfig : public ImmutablePass {
bool getEnableTailMerge() const { return EnableTailMerge; }
void setEnableTailMerge(bool Enable) { setOpt(EnableTailMerge, Enable); }
+ bool getEnableSinkAndFold() const { return EnableSinkAndFold; }
+ void setEnableSinkAndFold(bool Enable) { setOpt(EnableSinkAndFold, Enable); }
+
bool requiresCodeGenSCCOrder() const { return RequireCodeGenSCCOrder; }
void setRequiresCodeGenSCCOrder(bool Enable = true) {
setOpt(RequireCodeGenSCCOrder, Enable);
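A target opts in from its TargetPassConfig subclass; later in this patch
AArch64PassConfig does exactly that, guarded by the new
-aarch64-enable-sink-fold option. A minimal sketch for a hypothetical target
(MyTargetPassConfig and MyTargetMachine are placeholder names):

  // Sketch only; mirrors what the AArch64 changes below do.
  class MyTargetPassConfig : public TargetPassConfig {
  public:
    MyTargetPassConfig(MyTargetMachine &TM, PassManagerBase &PM)
        : TargetPassConfig(TM, PM) {
      // Default is false; MachineSink reads this through its required
      // TargetPassConfig analysis.
      setEnableSinkAndFold(true);
    }
  };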
diff --git a/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/llvm/lib/CodeGen/ImplicitNullChecks.cpp
index b2a7aad734115d7..5ad003ed3180207 100644
--- a/llvm/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/llvm/lib/CodeGen/ImplicitNullChecks.cpp
@@ -372,7 +372,7 @@ ImplicitNullChecks::isSuitableMemoryOp(const MachineInstr &MI,
if (!MI.mayLoadOrStore() || MI.isPredicable())
return SR_Unsuitable;
auto AM = TII->getAddrModeFromMemoryOp(MI, TRI);
- if (!AM)
+ if (!AM || AM->Form != ExtAddrMode::Formula::Basic)
return SR_Unsuitable;
auto AddrMode = *AM;
const Register BaseReg = AddrMode.BaseReg, ScaledReg = AddrMode.ScaledReg;
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index b4cbb93d758ef2f..480ac23d43ad879 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -41,6 +41,7 @@
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
@@ -115,6 +116,7 @@ STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");
namespace {
class MachineSinking : public MachineFunctionPass {
+ const TargetSubtargetInfo *STI = nullptr;
const TargetInstrInfo *TII = nullptr;
const TargetRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr; // Machine register information
@@ -165,7 +167,10 @@ namespace {
StoreInstrCache;
/// Cached BB's register pressure.
- std::map<MachineBasicBlock *, std::vector<unsigned>> CachedRegisterPressure;
+ std::map<const MachineBasicBlock *, std::vector<unsigned>>
+ CachedRegisterPressure;
+
+ bool EnableSinkAndFold;
public:
static char ID; // Pass identification
@@ -187,6 +192,7 @@ namespace {
AU.addPreserved<MachineLoopInfo>();
if (UseBlockFreqInfo)
AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<TargetPassConfig>();
}
void releaseMemory() override {
@@ -246,11 +252,17 @@ namespace {
bool PerformTrivialForwardCoalescing(MachineInstr &MI,
MachineBasicBlock *MBB);
+ bool PerformSinkAndFold(MachineInstr &MI, MachineBasicBlock *MBB);
+
SmallVector<MachineBasicBlock *, 4> &
GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
AllSuccsCache &AllSuccessors) const;
- std::vector<unsigned> &getBBRegisterPressure(MachineBasicBlock &MBB);
+ std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB);
+
+ bool registerPressureSetExceedsLimit(unsigned NRegs,
+ const TargetRegisterClass *RC,
+ const MachineBasicBlock &MBB);
};
} // end anonymous namespace
@@ -338,6 +350,224 @@ bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI,
return true;
}
+bool MachineSinking::PerformSinkAndFold(MachineInstr &MI,
+ MachineBasicBlock *MBB) {
+ if (MI.isCopy() || MI.mayLoadOrStore() ||
+ MI.getOpcode() == TargetOpcode::REG_SEQUENCE)
+ return false;
+
+ // Don't sink instructions that the target prefers not to sink.
+ if (!TII->shouldSink(MI))
+ return false;
+
+ // Check if it's safe to move the instruction.
+ bool SawStore = true;
+ if (!MI.isSafeToMove(AA, SawStore))
+ return false;
+
+ // Convergent operations may not be made control-dependent on additional
+ // values.
+ if (MI.isConvergent())
+ return false;
+
+ // Don't sink defs/uses of hard registers or if the instruction defines more
+ // than one register.
+ // Don't sink instructions with more than two register uses - handling at
+ // most two covers most of the cases and greatly simplifies the register
+ // pressure checks.
+ Register DefReg;
+ Register UsedRegA, UsedRegB;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isImm() || MO.isRegMask() || MO.isRegLiveOut() || MO.isMetadata() ||
+ MO.isMCSymbol() || MO.isDbgInstrRef() || MO.isCFIIndex() ||
+ MO.isIntrinsicID() || MO.isPredicate() || MO.isShuffleMask())
+ continue;
+ if (!MO.isReg())
+ return false;
+
+ Register Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ if (Reg.isVirtual()) {
+ if (MO.isDef()) {
+ if (DefReg)
+ return false;
+ DefReg = Reg;
+ continue;
+ }
+
+ if (UsedRegA == 0)
+ UsedRegA = Reg;
+ else if (UsedRegB == 0)
+ UsedRegB = Reg;
+ else
+ return false;
+ continue;
+ }
+
+ if (Reg.isPhysical() &&
+ (MRI->isConstantPhysReg(Reg) || TII->isIgnorableUse(MO)))
+ continue;
+
+ return false;
+ }
+
+ // Scan uses of the destination register. Each use must be reachable through
+ // a (possibly empty) chain of copies that terminates with either a copy into
+ // a hard register, or a load/store instruction where the use is part of the
+ // address (*not* the stored value).
+ using SinkInfo = std::pair<MachineInstr *, ExtAddrMode>;
+ SmallVector<SinkInfo> SinkInto;
+ SmallVector<Register> Worklist;
+
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ const TargetRegisterClass *RCA =
+ UsedRegA == 0 ? nullptr : MRI->getRegClass(UsedRegA);
+ const TargetRegisterClass *RCB =
+ UsedRegB == 0 ? nullptr : MRI->getRegClass(UsedRegB);
+
+ Worklist.push_back(DefReg);
+ while (!Worklist.empty()) {
+ Register Reg = Worklist.pop_back_val();
+
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ ExtAddrMode MaybeAM;
+ MachineInstr &UseInst = *MO.getParent();
+ if (UseInst.isCopy()) {
+ Register DstReg;
+ if (const MachineOperand &O = UseInst.getOperand(0); O.isReg())
+ DstReg = O.getReg();
+ if (DstReg == 0)
+ return false;
+ if (DstReg.isVirtual()) {
+ Worklist.push_back(DstReg);
+ continue;
+ }
+ // If we are going to replace a copy, the original instruction must be
+ // as cheap as a copy.
+ if (!TII->isAsCheapAsAMove(MI))
+ return false;
+ // The hard register must be in the register class of the original
+ // instruction's destination register.
+ if (!RC->contains(DstReg))
+ return false;
+ } else if (UseInst.mayLoadOrStore()) {
+ ExtAddrMode AM;
+ if (!TII->canFoldIntoAddrMode(UseInst, Reg, MI, AM))
+ return false;
+ MaybeAM = AM;
+ } else {
+ return false;
+ }
+
+ if (UseInst.getParent() != MI.getParent()) {
+ // If the register class of the register we are replacing is a superset
+ // of any of the register classes of the operands of the materialized
+ // instruction, don't consider that live range extended.
+ const TargetRegisterClass *RCS = MRI->getRegClass(Reg);
+ if (RCA && RCA->hasSuperClassEq(RCS))
+ RCA = nullptr;
+ else if (RCB && RCB->hasSuperClassEq(RCS))
+ RCB = nullptr;
+ if (RCA || RCB) {
+ if (RCA == nullptr) {
+ RCA = RCB;
+ RCB = nullptr;
+ }
+
+ unsigned NRegs = !!RCA + !!RCB;
+ if (RCA == RCB)
+ RCB = nullptr;
+
+ // Check we don't exceed register pressure at the destination.
+ const MachineBasicBlock &MBB = *UseInst.getParent();
+ if (RCB == nullptr) {
+ if (registerPressureSetExceedsLimit(NRegs, RCA, MBB))
+ return false;
+ } else if (registerPressureSetExceedsLimit(1, RCA, MBB) ||
+ registerPressureSetExceedsLimit(1, RCB, MBB)) {
+ return false;
+ }
+ }
+ }
+
+ SinkInto.emplace_back(&UseInst, MaybeAM);
+ }
+ }
+
+ if (SinkInto.empty())
+ return false;
+
+ // Now we know we can fold the instruction into all its users.
+ if (UsedRegA)
+ MRI->clearKillFlags(UsedRegA);
+ if (UsedRegB)
+ MRI->clearKillFlags(UsedRegB);
+
+ for (auto &[SinkDst, MaybeAM] : SinkInto) {
+ MachineInstr *New = nullptr;
+ LLVM_DEBUG(dbgs() << "Sinking copy of"; MI.dump(); dbgs() << "into";
+ SinkDst->dump());
+ if (SinkDst->isCopy()) {
+ // Sink a copy of the instruction, replacing a COPY instruction.
+ MachineBasicBlock::iterator InsertPt = SinkDst->getIterator();
+ Register DstReg = SinkDst->getOperand(0).getReg();
+ TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI, *TRI);
+ // If the original instruction did not have a source location, reuse the
+ // one from the COPY.
+ New = &*std::prev(InsertPt);
+ if (const DebugLoc &NewLoc = New->getDebugLoc(); !NewLoc)
+ New->setDebugLoc(SinkDst->getDebugLoc());
+ // Sink DBG_VALUEs, which refer to the original instruction's destination
+ // (DefReg).
+ MachineBasicBlock &SinkMBB = *SinkDst->getParent();
+ auto &DbgUsers = SeenDbgUsers[DefReg];
+ for (auto &U : DbgUsers) {
+ MachineInstr *DbgMI = U.getPointer();
+ if (U.getInt())
+ continue;
+ MachineInstr *NewDbgMI = SinkDst->getMF()->CloneMachineInstr(DbgMI);
+ NewDbgMI->getOperand(0).setReg(DstReg);
+ SinkMBB.insertAfter(InsertPt, NewDbgMI);
+ }
+ } else {
+ // Fold instruction into the addressing mode of a memory instruction.
+ New = TII->emitLdStWithAddr(*SinkDst, MaybeAM);
+ }
+ LLVM_DEBUG(dbgs() << "yielding"; New->dump());
+ SinkDst->eraseFromParent();
+ }
+
+ MI.eraseFromParent();
+
+ // Collect instructions that need to be deleted (COPYs). We cannot delete them
+ // while traversing register uses.
+ SmallVector<MachineInstr *> CleanupInstrs;
+ Worklist.push_back(DefReg);
+ while (!Worklist.empty()) {
+ Register Reg = Worklist.pop_back_val();
+
+ for (MachineOperand &MO : MRI->use_operands(Reg)) {
+ MachineInstr *U = MO.getParent();
+ assert((U->isCopy() || U->isDebugInstr()) &&
+ "Only debug uses and copies must remain");
+ if (U->isCopy()) {
+ Worklist.push_back(U->getOperand(0).getReg());
+ CleanupInstrs.push_back(U);
+ } else {
+ MO.setReg(0);
+ MO.setSubReg(0);
+ }
+ }
+ }
+
+ // Delete the dead COPYs.
+ for (MachineInstr *Del : CleanupInstrs)
+ Del->eraseFromParent();
+
+ return true;
+}
+
/// AllUsesDominatedByBlock - Return true if all uses of the specified register
/// occur in blocks dominated by the specified block. If any use is in the
/// definition block, then return false since it is never legal to move def
@@ -461,8 +691,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "******** Machine Sinking ********\n");
- TII = MF.getSubtarget().getInstrInfo();
- TRI = MF.getSubtarget().getRegisterInfo();
+ STI = &MF.getSubtarget();
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
MRI = &MF.getRegInfo();
DT = &getAnalysis<MachineDominatorTree>();
PDT = &getAnalysis<MachinePostDominatorTree>();
@@ -471,6 +702,8 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
RegClassInfo.runOnMachineFunction(MF);
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+ EnableSinkAndFold = PassConfig->getEnableSinkAndFold();
bool EverMadeChange = false;
@@ -547,8 +780,8 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
}
bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
- // Can't sink anything out of a block that has less than two successors.
- if (MBB.succ_size() <= 1 || MBB.empty()) return false;
+ if ((!EnableSinkAndFold && MBB.succ_size() <= 1) || MBB.empty())
+ return false;
// Don't bother sinking code out of unreachable blocks. In addition to being
// unprofitable, it can also lead to infinite looping, because in an
@@ -579,8 +812,16 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
continue;
}
- bool Joined = PerformTrivialForwardCoalescing(MI, &MBB);
- if (Joined) {
+ if (EnableSinkAndFold && PerformSinkAndFold(MI, &MBB)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // Can't sink anything out of a block that has less than two successors.
+ if (MBB.succ_size() <= 1)
+ continue;
+
+ if (PerformTrivialForwardCoalescing(MI, &MBB)) {
MadeChange = true;
continue;
}
@@ -597,7 +838,6 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
SeenDbgVars.clear();
// recalculate the bb register pressure after sinking one BB.
CachedRegisterPressure.clear();
-
return MadeChange;
}
@@ -737,7 +977,7 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
}
std::vector<unsigned> &
-MachineSinking::getBBRegisterPressure(MachineBasicBlock &MBB) {
+MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
// Currently to save compiling time, MBB's register pressure will not change
// in one ProcessBlock iteration because of CachedRegisterPressure. but MBB's
// register pressure is changed after sinking any instructions into it.
@@ -753,10 +993,10 @@ MachineSinking::getBBRegisterPressure(MachineBasicBlock &MBB) {
RPTracker.init(MBB.getParent(), &RegClassInfo, nullptr, &MBB, MBB.end(),
/*TrackLaneMasks*/ false, /*TrackUntiedDefs=*/true);
- for (MachineBasicBlock::iterator MII = MBB.instr_end(),
- MIE = MBB.instr_begin();
+ for (MachineBasicBlock::const_iterator MII = MBB.instr_end(),
+ MIE = MBB.instr_begin();
MII != MIE; --MII) {
- MachineInstr &MI = *std::prev(MII);
+ const MachineInstr &MI = *std::prev(MII);
if (MI.isDebugInstr() || MI.isPseudoProbe())
continue;
RegisterOperands RegOpers;
@@ -772,6 +1012,19 @@ MachineSinking::getBBRegisterPressure(MachineBasicBlock &MBB) {
return It.first->second;
}
+bool MachineSinking::registerPressureSetExceedsLimit(
+ unsigned NRegs, const TargetRegisterClass *RC,
+ const MachineBasicBlock &MBB) {
+ unsigned Weight = NRegs * TRI->getRegClassWeight(RC).RegWeight;
+ const int *PS = TRI->getRegClassPressureSets(RC);
+ std::vector<unsigned> BBRegisterPressure = getBBRegisterPressure(MBB);
+ for (; *PS != -1; PS++)
+ if (Weight + BBRegisterPressure[*PS] >=
+ TRI->getRegPressureSetLimit(*MBB.getParent(), *PS))
+ return true;
+ return false;
+}
+
/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -816,21 +1069,6 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
if (!MCycle)
return false;
- auto isRegisterPressureSetExceedLimit = [&](const TargetRegisterClass *RC) {
- unsigned Weight = TRI->getRegClassWeight(RC).RegWeight;
- const int *PS = TRI->getRegClassPressureSets(RC);
- // Get register pressure for block SuccToSinkTo.
- std::vector<unsigned> BBRegisterPressure =
- getBBRegisterPressure(*SuccToSinkTo);
- for (; *PS != -1; PS++)
- // check if any register pressure set exceeds limit in block SuccToSinkTo
- // after sinking.
- if (Weight + BBRegisterPressure[*PS] >=
- TRI->getRegPressureSetLimit(*MBB->getParent(), *PS))
- return true;
- return false;
- };
-
// If this instruction is inside a Cycle and sinking this instruction can make
// more registers live range shorten, it is still prifitable.
for (const MachineOperand &MO : MI.operands()) {
@@ -870,7 +1108,8 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
// The DefMI is defined inside the cycle.
// If sinking this operand makes some register pressure set exceed limit,
// it is not profitable.
- if (isRegisterPressureSetExceedLimit(MRI->getRegClass(Reg))) {
+ if (registerPressureSetExceedsLimit(1, MRI->getRegClass(Reg),
+ *SuccToSinkTo)) {
LLVM_DEBUG(dbgs() << "register pressure exceed limit, not profitable.");
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6c717ca1390a638..d6e80dd9a7c3b8d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15617,25 +15617,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
NumBytes = 0;
}
- if (!AM.Scale) {
- int64_t Offset = AM.BaseOffs;
-
- // 9-bit signed offset
- if (isInt<9>(Offset))
- return true;
-
- // 12-bit unsigned offset
- unsigned shift = Log2_64(NumBytes);
- if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
- // Must be a multiple of NumBytes (NumBytes is a power of 2)
- (Offset >> shift) << shift == Offset)
- return true;
- return false;
- }
-
- // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
-
- return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
+ return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
+ AM.Scale);
}
bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4666e7b1614afa3..c6a59ec44ef80e9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2681,6 +2681,727 @@ AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
return AM;
}
+bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
+ Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const {
+ // Filter out instructions into which we cannot fold.
+ unsigned NumBytes;
+ int64_t OffsetScale = 1;
+ switch (MemI.getOpcode()) {
+ default:
+ return false;
+
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ NumBytes = 16;
+ break;
+
+ case AArch64::LDURDi:
+ case AArch64::STURDi:
+ case AArch64::LDURXi:
+ case AArch64::STURXi:
+ NumBytes = 8;
+ break;
+
+ case AArch64::LDURWi:
+ case AArch64::LDURSWi:
+ case AArch64::STURWi:
+ NumBytes = 4;
+ break;
+
+ case AArch64::LDURHi:
+ case AArch64::STURHi:
+ case AArch64::LDURHHi:
+ case AArch64::STURHHi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSHWi:
+ NumBytes = 2;
+ break;
+
+ case AArch64::LDRBroX:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSBWroX:
+ case AArch64::STRBroX:
+ case AArch64::STRBBroX:
+ case AArch64::LDURBi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSBWi:
+ case AArch64::STURBi:
+ case AArch64::STURBBi:
+ case AArch64::LDRBui:
+ case AArch64::LDRBBui:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSBWui:
+ case AArch64::STRBui:
+ case AArch64::STRBBui:
+ NumBytes = 1;
+ break;
+
+ case AArch64::LDRQroX:
+ case AArch64::STRQroX:
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ NumBytes = 16;
+ OffsetScale = 16;
+ break;
+
+ case AArch64::LDRDroX:
+ case AArch64::STRDroX:
+ case AArch64::LDRXroX:
+ case AArch64::STRXroX:
+ case AArch64::LDRDui:
+ case AArch64::STRDui:
+ case AArch64::LDRXui:
+ case AArch64::STRXui:
+ NumBytes = 8;
+ OffsetScale = 8;
+ break;
+
+ case AArch64::LDRWroX:
+ case AArch64::LDRSWroX:
+ case AArch64::STRWroX:
+ case AArch64::LDRWui:
+ case AArch64::LDRSWui:
+ case AArch64::STRWui:
+ NumBytes = 4;
+ OffsetScale = 4;
+ break;
+
+ case AArch64::LDRHroX:
+ case AArch64::STRHroX:
+ case AArch64::LDRHHroX:
+ case AArch64::STRHHroX:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRHui:
+ case AArch64::STRHui:
+ case AArch64::LDRHHui:
+ case AArch64::STRHHui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSHWui:
+ NumBytes = 2;
+ OffsetScale = 2;
+ break;
+ }
+
+ // Check the fold operand is not the loaded/stored value.
+ const MachineOperand &BaseRegOp = MemI.getOperand(0);
+ if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
+ return false;
+
+ // Handle memory instructions with a [Reg, Reg] addressing mode.
+ if (MemI.getOperand(2).isReg()) {
+ // Bail if the addressing mode already includes extension of the offset
+ // register.
+ if (MemI.getOperand(3).getImm())
+ return false;
+
+ // Check if we actually have a scaled offset.
+ if (MemI.getOperand(4).getImm() == 0)
+ OffsetScale = 1;
+
+ // If the address instruction is folded into the base register, then the
+ // addressing mode must not have a scale. Then we can swap the base and the
+ // scaled registers.
+ if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
+ return false;
+
+ switch (AddrI.getOpcode()) {
+ default:
+ return false;
+
+ case AArch64::SBFMXri:
+ // sxtw Xa, Wm
+ // ldr Xd, [Xn, Xa, lsl #N]
+ // ->
+ // ldr Xd, [Xn, Wm, sxtw #N]
+ if (AddrI.getOperand(2).getImm() != 0 ||
+ AddrI.getOperand(3).getImm() != 31)
+ return false;
+
+ AM.BaseReg = MemI.getOperand(1).getReg();
+ if (AM.BaseReg == Reg)
+ AM.BaseReg = MemI.getOperand(2).getReg();
+ AM.ScaledReg = AddrI.getOperand(1).getReg();
+ AM.Scale = OffsetScale;
+ AM.Displacement = 0;
+ AM.Form = ExtAddrMode::Formula::SExtScaledReg;
+ return true;
+
+ case TargetOpcode::SUBREG_TO_REG: {
+ // mov Wa, Wm
+ // ldr Xd, [Xn, Xa, lsl #N]
+ // ->
+ // ldr Xd, [Xn, Wm, uxtw #N]
+
+ // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
+ if (AddrI.getOperand(1).getImm() != 0 ||
+ AddrI.getOperand(3).getImm() != AArch64::sub_32)
+ return false;
+
+ const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
+ Register OffsetReg = AddrI.getOperand(2).getReg();
+ if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
+ return false;
+
+ const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
+ if (DefMI.getOpcode() != AArch64::ORRWrs ||
+ DefMI.getOperand(1).getReg() != AArch64::WZR ||
+ DefMI.getOperand(3).getImm() != 0)
+ return false;
+
+ AM.BaseReg = MemI.getOperand(1).getReg();
+ if (AM.BaseReg == Reg)
+ AM.BaseReg = MemI.getOperand(2).getReg();
+ AM.ScaledReg = DefMI.getOperand(2).getReg();
+ AM.Scale = OffsetScale;
+ AM.Displacement = 0;
+ AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
+ return true;
+ }
+ }
+ }
+
+ // Handle memory instructions with a [Reg, #Imm] addressing mode.
+ auto canFoldAddSubImmIntoAddrMode = [&](int64_t Offset) -> bool {
+ Offset += MemI.getOperand(2).getImm() * OffsetScale;
+ if (!isLegalAddressingMode(NumBytes, Offset, /* Scale */ 0))
+ return false;
+ AM.BaseReg = AddrI.getOperand(1).getReg();
+ AM.ScaledReg = 0;
+ AM.Scale = 0;
+ AM.Displacement = Offset;
+ AM.Form = ExtAddrMode::Formula::Basic;
+ return true;
+ };
+
+ auto canFoldAddRegIntoAddrMode =
+ [&](int64_t Scale,
+ ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
+ if (MemI.getOperand(2).getImm() != 0)
+ return false;
+ if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
+ return false;
+ AM.BaseReg = AddrI.getOperand(1).getReg();
+ AM.ScaledReg = AddrI.getOperand(2).getReg();
+ AM.Scale = Scale;
+ AM.Displacement = 0;
+ AM.Form = Form;
+ return true;
+ };
+
+ auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
+ unsigned Opcode = MemI.getOpcode();
+ return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
+ Subtarget.isSTRQroSlow();
+ };
+
+ int64_t Offset = 0;
+ const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
+ switch (AddrI.getOpcode()) {
+ default:
+ return false;
+
+ case AArch64::ADDXri:
+ // add Xa, Xn, #N
+ // ldr Xd, [Xa, #M]
+ // ->
+ // ldr Xd, [Xn, #N'+M]
+ Offset = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
+ return canFoldAddSubImmIntoAddrMode(Offset);
+
+ case AArch64::SUBXri:
+ // sub Xa, Xn, #N
+ // ldr Xd, [Xa, #M]
+ // ->
+ // ldr Xd, [Xn, #N'+M]
+ Offset = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
+ return canFoldAddSubImmIntoAddrMode(-Offset);
+
+ case AArch64::ADDXrs: {
+ // add Xa, Xn, Xm, lsl #N
+ // ldr Xd, [Xa]
+ // ->
+ // ldr Xd, [Xn, Xm, lsl #N]
+
+ // Don't fold the add if the result would be slower, unless optimising for
+ // size.
+ int64_t Shift = AddrI.getOperand(3).getImm();
+ if (!OptSize) {
+ if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
+ return false;
+ if (avoidSlowSTRQ(MemI))
+ return false;
+ }
+ return canFoldAddRegIntoAddrMode(1 << Shift);
+ }
+
+ case AArch64::ADDXrr:
+ // add Xa, Xn, Xm
+ // ldr Xd, [Xa]
+ // ->
+ // ldr Xd, [Xn, Xm, lsl #0]
+
+ // Don't fold the add if the result would be slower, unless optimising for
+ // size.
+ if (!OptSize && avoidSlowSTRQ(MemI))
+ return false;
+ return canFoldAddRegIntoAddrMode(1);
+
+ case AArch64::ADDXrx:
+ // add Xa, Xn, Wm, {s,u}xtw #N
+ // ldr Xd, [Xa]
+ // ->
+ // ldr Xd, [Xn, Wm, {s,u}xtw #N]
+
+ // Don't fold the add if the result would be slower, unless optimising for
+ // size.
+ if (!OptSize && avoidSlowSTRQ(MemI))
+ return false;
+
+ // Can fold only sign-/zero-extend of a word.
+ unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
+ AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
+ if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
+ return false;
+
+ return canFoldAddRegIntoAddrMode(1 << AArch64_AM::getArithShiftValue(Imm),
+ (Extend == AArch64_AM::SXTW)
+ ? ExtAddrMode::Formula::SExtScaledReg
+ : ExtAddrMode::Formula::ZExtScaledReg);
+ }
+}
+
+// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
+// return the opcode of an instruction performing the same operation, but using
+// the [Reg, Reg] addressing mode.
+static unsigned regOffsetOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Address folding not implemented for instruction");
+
+ case AArch64::LDURQi:
+ case AArch64::LDRQui:
+ return AArch64::LDRQroX;
+ case AArch64::STURQi:
+ case AArch64::STRQui:
+ return AArch64::STRQroX;
+ case AArch64::LDURDi:
+ case AArch64::LDRDui:
+ return AArch64::LDRDroX;
+ case AArch64::STURDi:
+ case AArch64::STRDui:
+ return AArch64::STRDroX;
+ case AArch64::LDURXi:
+ case AArch64::LDRXui:
+ return AArch64::LDRXroX;
+ case AArch64::STURXi:
+ case AArch64::STRXui:
+ return AArch64::STRXroX;
+ case AArch64::LDURWi:
+ case AArch64::LDRWui:
+ return AArch64::LDRWroX;
+ case AArch64::LDURSWi:
+ case AArch64::LDRSWui:
+ return AArch64::LDRSWroX;
+ case AArch64::STURWi:
+ case AArch64::STRWui:
+ return AArch64::STRWroX;
+ case AArch64::LDURHi:
+ case AArch64::LDRHui:
+ return AArch64::LDRHroX;
+ case AArch64::STURHi:
+ case AArch64::STRHui:
+ return AArch64::STRHroX;
+ case AArch64::LDURHHi:
+ case AArch64::LDRHHui:
+ return AArch64::LDRHHroX;
+ case AArch64::STURHHi:
+ case AArch64::STRHHui:
+ return AArch64::STRHHroX;
+ case AArch64::LDURSHXi:
+ case AArch64::LDRSHXui:
+ return AArch64::LDRSHXroX;
+ case AArch64::LDURSHWi:
+ case AArch64::LDRSHWui:
+ return AArch64::LDRSHWroX;
+ case AArch64::LDURBi:
+ case AArch64::LDRBui:
+ return AArch64::LDRBroX;
+ case AArch64::LDURBBi:
+ case AArch64::LDRBBui:
+ return AArch64::LDRBBroX;
+ case AArch64::LDURSBXi:
+ case AArch64::LDRSBXui:
+ return AArch64::LDRSBXroX;
+ case AArch64::LDURSBWi:
+ case AArch64::LDRSBWui:
+ return AArch64::LDRSBWroX;
+ case AArch64::STURBi:
+ case AArch64::STRBui:
+ return AArch64::STRBroX;
+ case AArch64::STURBBi:
+ case AArch64::STRBBui:
+ return AArch64::STRBBroX;
+ }
+}
+
+// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
+// the opcode of an instruction performing the same operation, but using the
+// [Reg, #Imm] addressing mode with scaled offset.
+unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Address folding not implemented for instruction");
+
+ case AArch64::LDURQi:
+ Scale = 16;
+ return AArch64::LDRQui;
+ case AArch64::STURQi:
+ Scale = 16;
+ return AArch64::STRQui;
+ case AArch64::LDURDi:
+ Scale = 8;
+ return AArch64::LDRDui;
+ case AArch64::STURDi:
+ Scale = 8;
+ return AArch64::STRDui;
+ case AArch64::LDURXi:
+ Scale = 8;
+ return AArch64::LDRXui;
+ case AArch64::STURXi:
+ Scale = 8;
+ return AArch64::STRXui;
+ case AArch64::LDURWi:
+ Scale = 4;
+ return AArch64::LDRWui;
+ case AArch64::LDURSWi:
+ Scale = 4;
+ return AArch64::LDRSWui;
+ case AArch64::STURWi:
+ Scale = 4;
+ return AArch64::STRWui;
+ case AArch64::LDURHi:
+ Scale = 2;
+ return AArch64::LDRHui;
+ case AArch64::STURHi:
+ Scale = 2;
+ return AArch64::STRHui;
+ case AArch64::LDURHHi:
+ Scale = 2;
+ return AArch64::LDRHHui;
+ case AArch64::STURHHi:
+ Scale = 2;
+ return AArch64::STRHHui;
+ case AArch64::LDURSHXi:
+ Scale = 2;
+ return AArch64::LDRSHXui;
+ case AArch64::LDURSHWi:
+ Scale = 2;
+ return AArch64::LDRSHWui;
+ case AArch64::LDURBi:
+ Scale = 1;
+ return AArch64::LDRBui;
+ case AArch64::LDURBBi:
+ Scale = 1;
+ return AArch64::LDRBBui;
+ case AArch64::LDURSBXi:
+ Scale = 1;
+ return AArch64::LDRSBXui;
+ case AArch64::LDURSBWi:
+ Scale = 1;
+ return AArch64::LDRSBWui;
+ case AArch64::STURBi:
+ Scale = 1;
+ return AArch64::STRBui;
+ case AArch64::STURBBi:
+ Scale = 1;
+ return AArch64::STRBBui;
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ Scale = 16;
+ return Opcode;
+ case AArch64::LDRDui:
+ case AArch64::STRDui:
+ case AArch64::LDRXui:
+ case AArch64::STRXui:
+ Scale = 8;
+ return Opcode;
+ case AArch64::LDRWui:
+ case AArch64::LDRSWui:
+ case AArch64::STRWui:
+ Scale = 4;
+ return Opcode;
+ case AArch64::LDRHui:
+ case AArch64::STRHui:
+ case AArch64::LDRHHui:
+ case AArch64::STRHHui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSHWui:
+ Scale = 2;
+ return Opcode;
+ case AArch64::LDRBui:
+ case AArch64::LDRBBui:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSBWui:
+ case AArch64::STRBui:
+ case AArch64::STRBBui:
+ Scale = 1;
+ return Opcode;
+ }
+}
+
+// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
+// the opcode of an instruction performing the same operation, but using the
+// [Reg, #Imm] addressing mode with unscaled offset.
+unsigned unscaledOffsetOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Address folding not implemented for instruction");
+
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ case AArch64::LDURDi:
+ case AArch64::STURDi:
+ case AArch64::LDURXi:
+ case AArch64::STURXi:
+ case AArch64::LDURWi:
+ case AArch64::LDURSWi:
+ case AArch64::STURWi:
+ case AArch64::LDURHi:
+ case AArch64::STURHi:
+ case AArch64::LDURHHi:
+ case AArch64::STURHHi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURBi:
+ case AArch64::STURBi:
+ case AArch64::LDURBBi:
+ case AArch64::STURBBi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSBXi:
+ return Opcode;
+ case AArch64::LDRQui:
+ return AArch64::LDURQi;
+ case AArch64::STRQui:
+ return AArch64::STURQi;
+ case AArch64::LDRDui:
+ return AArch64::LDURDi;
+ case AArch64::STRDui:
+ return AArch64::STURDi;
+ case AArch64::LDRXui:
+ return AArch64::LDURXi;
+ case AArch64::STRXui:
+ return AArch64::STURXi;
+ case AArch64::LDRWui:
+ return AArch64::LDURWi;
+ case AArch64::LDRSWui:
+ return AArch64::LDURSWi;
+ case AArch64::STRWui:
+ return AArch64::STURWi;
+ case AArch64::LDRHui:
+ return AArch64::LDURHi;
+ case AArch64::STRHui:
+ return AArch64::STURHi;
+ case AArch64::LDRHHui:
+ return AArch64::LDURHHi;
+ case AArch64::STRHHui:
+ return AArch64::STURHHi;
+ case AArch64::LDRSHXui:
+ return AArch64::LDURSHXi;
+ case AArch64::LDRSHWui:
+ return AArch64::LDURSHWi;
+ case AArch64::LDRBBui:
+ return AArch64::LDURBBi;
+ case AArch64::LDRBui:
+ return AArch64::LDURBi;
+ case AArch64::STRBBui:
+ return AArch64::STURBBi;
+ case AArch64::STRBui:
+ return AArch64::STURBi;
+ case AArch64::LDRSBWui:
+ return AArch64::LDURSBWi;
+ case AArch64::LDRSBXui:
+ return AArch64::LDURSBXi;
+ }
+}
+
+// Given the opcode of a memory load/store instruction, return the opcode of an
+// instruction performing the same operation, but using
+// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
+// offset register.
+static unsigned offsetExtendOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Address folding not implemented for instruction");
+
+ case AArch64::LDRQroX:
+ case AArch64::LDURQi:
+ case AArch64::LDRQui:
+ return AArch64::LDRQroW;
+ case AArch64::STRQroX:
+ case AArch64::STURQi:
+ case AArch64::STRQui:
+ return AArch64::STRQroW;
+ case AArch64::LDRDroX:
+ case AArch64::LDURDi:
+ case AArch64::LDRDui:
+ return AArch64::LDRDroW;
+ case AArch64::STRDroX:
+ case AArch64::STURDi:
+ case AArch64::STRDui:
+ return AArch64::STRDroW;
+ case AArch64::LDRXroX:
+ case AArch64::LDURXi:
+ case AArch64::LDRXui:
+ return AArch64::LDRXroW;
+ case AArch64::STRXroX:
+ case AArch64::STURXi:
+ case AArch64::STRXui:
+ return AArch64::STRXroW;
+ case AArch64::LDRWroX:
+ case AArch64::LDURWi:
+ case AArch64::LDRWui:
+ return AArch64::LDRWroW;
+ case AArch64::LDRSWroX:
+ case AArch64::LDURSWi:
+ case AArch64::LDRSWui:
+ return AArch64::LDRSWroW;
+ case AArch64::STRWroX:
+ case AArch64::STURWi:
+ case AArch64::STRWui:
+ return AArch64::STRWroW;
+ case AArch64::LDRHroX:
+ case AArch64::LDURHi:
+ case AArch64::LDRHui:
+ return AArch64::LDRHroW;
+ case AArch64::STRHroX:
+ case AArch64::STURHi:
+ case AArch64::STRHui:
+ return AArch64::STRHroW;
+ case AArch64::LDRHHroX:
+ case AArch64::LDURHHi:
+ case AArch64::LDRHHui:
+ return AArch64::LDRHHroW;
+ case AArch64::STRHHroX:
+ case AArch64::STURHHi:
+ case AArch64::STRHHui:
+ return AArch64::STRHHroW;
+ case AArch64::LDRSHXroX:
+ case AArch64::LDURSHXi:
+ case AArch64::LDRSHXui:
+ return AArch64::LDRSHXroW;
+ case AArch64::LDRSHWroX:
+ case AArch64::LDURSHWi:
+ case AArch64::LDRSHWui:
+ return AArch64::LDRSHWroW;
+ case AArch64::LDRBroX:
+ case AArch64::LDURBi:
+ case AArch64::LDRBui:
+ return AArch64::LDRBroW;
+ case AArch64::LDRBBroX:
+ case AArch64::LDURBBi:
+ case AArch64::LDRBBui:
+ return AArch64::LDRBBroW;
+ case AArch64::LDRSBXroX:
+ case AArch64::LDURSBXi:
+ case AArch64::LDRSBXui:
+ return AArch64::LDRSBXroW;
+ case AArch64::LDRSBWroX:
+ case AArch64::LDURSBWi:
+ case AArch64::LDRSBWui:
+ return AArch64::LDRSBWroW;
+ case AArch64::STRBroX:
+ case AArch64::STURBi:
+ case AArch64::STRBui:
+ return AArch64::STRBroW;
+ case AArch64::STRBBroX:
+ case AArch64::STURBBi:
+ case AArch64::STRBBui:
+ return AArch64::STRBBroW;
+ }
+}
+
+MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const {
+
+ const DebugLoc &DL = MemI.getDebugLoc();
+ MachineBasicBlock &MBB = *MemI.getParent();
+ MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
+
+ if (AM.Form == ExtAddrMode::Formula::Basic) {
+ if (AM.ScaledReg) {
+ // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
+ unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
+ MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
+ auto B = BuildMI(MBB, MemI, DL, get(Opcode))
+ .addReg(MemI.getOperand(0).getReg(),
+ MemI.mayLoad() ? RegState::Define : 0)
+ .addReg(AM.BaseReg)
+ .addReg(AM.ScaledReg)
+ .addImm(0)
+ .addImm(AM.Scale > 1)
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+ return B.getInstr();
+ }
+
+ assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
+ "Addressing mode not supported for folding");
+
+ // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
+ unsigned Scale = 1;
+ unsigned Opcode = MemI.getOpcode();
+ if (isInt<9>(AM.Displacement))
+ Opcode = unscaledOffsetOpcode(Opcode);
+ else
+ Opcode = scaledOffsetOpcode(Opcode, Scale);
+
+ auto B = BuildMI(MBB, MemI, DL, get(Opcode))
+ .addReg(MemI.getOperand(0).getReg(),
+ MemI.mayLoad() ? RegState::Define : 0)
+ .addReg(AM.BaseReg)
+ .addImm(AM.Displacement / Scale)
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+ return B.getInstr();
+ }
+
+ if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
+ AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
+ // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
+ assert(AM.ScaledReg && !AM.Displacement &&
+ "Address offset can be a register or an immediate, but not both");
+ unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
+ MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
+ // Make sure the offset register is in the correct register class.
+ Register OffsetReg = AM.ScaledReg;
+ const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
+ if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
+ OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
+ .addReg(AM.ScaledReg, 0, AArch64::sub_32);
+ }
+ auto B = BuildMI(MBB, MemI, DL, get(Opcode))
+ .addReg(MemI.getOperand(0).getReg(),
+ MemI.mayLoad() ? RegState::Define : 0)
+ .addReg(AM.BaseReg)
+ .addReg(OffsetReg)
+ .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
+ .addImm(AM.Scale != 1)
+ .setMemRefs(MemI.memoperands())
+ .setMIFlags(MemI.getFlags());
+
+ return B.getInstr();
+ }
+
+ llvm_unreachable(
+ "Function must not be called with an addressing mode it can't handle");
+}
+
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
bool &OffsetIsScalable, unsigned &Width,
@@ -8571,6 +9292,30 @@ AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
}
+bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
+ unsigned Scale) const {
+ if (Offset && Scale)
+ return false;
+
+ // Check Reg + Imm
+ if (!Scale) {
+ // 9-bit signed offset
+ if (isInt<9>(Offset))
+ return true;
+
+ // 12-bit unsigned offset
+ unsigned Shift = Log2_64(NumBytes);
+ if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
+ // Must be a multiple of NumBytes (NumBytes is a power of 2)
+ (Offset >> Shift) << Shift == Offset)
+ return true;
+ return false;
+ }
+
+ // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
+ return Scale == 1 || (Scale > 0 && Scale == NumBytes);
+}
+
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
return AArch64::BLRNoIP;
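For a quick reference, a few illustrative calls to the new helper and the
results implied by the checks above (not tests from the patch; TII is assumed
to point to the AArch64InstrInfo instance):

  // Illustration only, derived from the logic of isLegalAddressingMode above.
  assert(TII->isLegalAddressingMode(/*NumBytes=*/4, /*Offset=*/-256, /*Scale=*/0)); // 9-bit signed offset
  assert(TII->isLegalAddressingMode(8, 32760, 0));  // 4095 * 8: scaled 12-bit unsigned offset
  assert(!TII->isLegalAddressingMode(8, 32761, 0)); // not a multiple of the access size
  assert(TII->isLegalAddressingMode(4, 0, 1));      // reg1 + reg2
  assert(TII->isLegalAddressingMode(4, 0, 4));      // reg1 + reg2 * SIZE_IN_BYTES
  assert(!TII->isLegalAddressingMode(4, 0, 2));     // scale must be 1 or the access size
  assert(!TII->isLegalAddressingMode(4, 16, 4));    // offset and scale cannot be combined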
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 4a4d87c1b1f6ba5..f1a4928939bcd9e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -140,6 +140,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
getAddrModeFromMemoryOp(const MachineInstr &MemI,
const TargetRegisterInfo *TRI) const override;
+ bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+ const MachineInstr &AddrI,
+ ExtAddrMode &AM) const override;
+
+ MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
+ const ExtAddrMode &AM) const override;
+
bool getMemOperandsWithOffsetWidth(
const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
@@ -362,6 +369,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
int64_t &VGSized);
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
+
+ // Return true if an address of the form BaseReg + Scale * ScaledReg + Offset
+ // can be used for a load/store of NumBytes. BaseReg is always present and
+ // implicit.
+ bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
+ unsigned Scale) const;
+
#define GET_INSTRINFO_HELPER_DECLS
#include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 2c75a478f52c293..3d818c76bd4b7d7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -197,6 +197,11 @@ static cl::opt<bool> EnableGISelLoadStoreOptPostLegal(
cl::desc("Enable GlobalISel's post-legalizer load/store optimization pass"),
cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ EnableSinkFold("aarch64-enable-sink-fold",
+ cl::desc("Enable sinking and folding of instruction copies"),
+ cl::init(false), cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -472,6 +477,7 @@ class AArch64PassConfig : public TargetPassConfig {
: TargetPassConfig(TM, PM) {
if (TM.getOptLevel() != CodeGenOptLevel::None)
substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+ setEnableSinkAndFold(EnableSinkFold);
}
AArch64TargetMachine &getAArch64TargetMachine() const {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 73675a868239ea1..4320a0e94b7a71f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3842,7 +3842,7 @@ bool X86InstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
ExtAddrMode AM = *AMOrNone;
-
+ assert(AM.Form == ExtAddrMode::Formula::Basic);
if (AM.ScaledReg != X86::NoRegister) {
switch (AM.Scale) {
case 1:
diff --git a/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll b/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
index 6b1c6fa674602d5..a72f9df9e496fd7 100644
--- a/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
+++ b/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -o - | FileCheck %s
-; RUN: llc -mattr=+alu-lsl-fast < %s -o - | FileCheck %s -check-prefix=LSLFAST
+; RUN: llc < %s -o - | FileCheck %s
+; RUN: llc -mattr=+alu-lsl-fast --aarch64-enable-sink-fold=false < %s -o - | FileCheck %s -check-prefix=LSLFAST
target triple = "aarch64-linux"
declare void @g(...)
diff --git a/llvm/test/CodeGen/AArch64/align-down.ll b/llvm/test/CodeGen/AArch64/align-down.ll
index cda0d1304b1c5d2..767a1dff445d18d 100644
--- a/llvm/test/CodeGen/AArch64/align-down.ll
+++ b/llvm/test/CodeGen/AArch64/align-down.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu --aarch64-enable-sink-fold=true < %s | FileCheck %s
; Fold
; ptr - (ptr & (alignment-1))
@@ -55,9 +55,9 @@ define i32 @t3_extrause0(i32 %ptr, i32 %alignment, i32* %mask_storage) nounwind
; CHECK-LABEL: t3_extrause0:
; CHECK: // %bb.0:
; CHECK-NEXT: neg w8, w1
+; CHECK-NEXT: sub w9, w1, #1
; CHECK-NEXT: and w0, w0, w8
-; CHECK-NEXT: sub w8, w1, #1
-; CHECK-NEXT: str w8, [x2]
+; CHECK-NEXT: str w9, [x2]
; CHECK-NEXT: ret
%mask = add i32 %alignment, -1
store i32 %mask, i32* %mask_storage
diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
index fa618ef6ac37bba..17ff01597016893 100644
--- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-enable-collect-loh=false < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-enable-collect-loh=false -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-enable-collect-loh=false -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-enable-collect-loh=false -aarch64-enable-sink-fold=true -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
@board = common global [400 x i8] zeroinitializer, align 1
@next_string = common global i32 0, align 4
@@ -10,21 +10,20 @@
define void @new_position(i32 %pos) {
; CHECK-SD-LABEL: new_position:
; CHECK-SD: ; %bb.0: ; %entry
-; CHECK-SD-NEXT: adrp x9, _board@GOTPAGE
+; CHECK-SD-NEXT: adrp x8, _board@GOTPAGE
; CHECK-SD-NEXT: ; kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: sxtw x8, w0
-; CHECK-SD-NEXT: ldr x9, [x9, _board@GOTPAGEOFF]
-; CHECK-SD-NEXT: ldrb w9, [x9, x8]
-; CHECK-SD-NEXT: sub w9, w9, #1
-; CHECK-SD-NEXT: cmp w9, #1
+; CHECK-SD-NEXT: ldr x8, [x8, _board@GOTPAGEOFF]
+; CHECK-SD-NEXT: ldrb w8, [x8, w0, sxtw]
+; CHECK-SD-NEXT: sub w8, w8, #1
+; CHECK-SD-NEXT: cmp w8, #1
; CHECK-SD-NEXT: b.hi LBB0_2
; CHECK-SD-NEXT: ; %bb.1: ; %if.then
-; CHECK-SD-NEXT: adrp x9, _next_string@GOTPAGE
-; CHECK-SD-NEXT: adrp x10, _string_number@GOTPAGE
-; CHECK-SD-NEXT: ldr x9, [x9, _next_string@GOTPAGEOFF]
-; CHECK-SD-NEXT: ldr x10, [x10, _string_number@GOTPAGEOFF]
-; CHECK-SD-NEXT: ldr w9, [x9]
-; CHECK-SD-NEXT: str w9, [x10, x8, lsl #2]
+; CHECK-SD-NEXT: adrp x8, _next_string@GOTPAGE
+; CHECK-SD-NEXT: adrp x9, _string_number@GOTPAGE
+; CHECK-SD-NEXT: ldr x8, [x8, _next_string@GOTPAGEOFF]
+; CHECK-SD-NEXT: ldr x9, [x9, _string_number@GOTPAGEOFF]
+; CHECK-SD-NEXT: ldr w8, [x8]
+; CHECK-SD-NEXT: str w8, [x9, w0, sxtw #2]
; CHECK-SD-NEXT: LBB0_2: ; %if.end
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
index e8c1c124c06bca8..d943afb23c03b2e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -aarch64-load-store-renaming=true < %s -mtriple=arm64-apple-ios7.0.0 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc -aarch64-load-store-renaming=true -aarch64-enable-sink-fold=true < %s -mtriple=arm64-apple-ios7.0.0 -mcpu=cyclone -enable-misched=false | FileCheck %s
; rdar://13625505
; Here we have 9 fixed integer arguments the 9th argument in on stack, the
diff --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
index 49ddc6f6147805f..a1e06936852726c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s --check-prefix=CHECK-SDAG
-; RUN: llc < %s -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s --check-prefix=CHECK-GISEL
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast -aarch64-enable-sink-fold=true | FileCheck %s --check-prefix=CHECK-SDAG
+; RUN: llc < %s -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast -aarch64-enable-sink-fold=true | FileCheck %s --check-prefix=CHECK-GISEL
define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
; CHECK-SDAG-LABEL: test_varidx_extract_v8s8:
@@ -28,8 +28,8 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
; CHECK-GISEL-NEXT: sub sp, sp, #16
; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
; CHECK-GISEL-NEXT: mov w9, w0
-; CHECK-GISEL-NEXT: add x8, sp, #8
; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GISEL-NEXT: add x8, sp, #8
; CHECK-GISEL-NEXT: str d0, [sp, #8]
; CHECK-GISEL-NEXT: and x9, x9, #0x7
; CHECK-GISEL-NEXT: mov b2, v0.b[1]
@@ -37,8 +37,7 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
; CHECK-GISEL-NEXT: lsl x10, x9, #1
; CHECK-GISEL-NEXT: mov b0, v0.b[3]
; CHECK-GISEL-NEXT: sub x9, x10, x9
-; CHECK-GISEL-NEXT: add x8, x8, x9
-; CHECK-GISEL-NEXT: ldrb w8, [x8]
+; CHECK-GISEL-NEXT: ldrb w8, [x8, x9]
; CHECK-GISEL-NEXT: fmov s1, w8
; CHECK-GISEL-NEXT: mov v1.h[1], v2.h[0]
; CHECK-GISEL-NEXT: mov v1.h[2], v3.h[0]
diff --git a/llvm/test/CodeGen/AArch64/arm64-long-shift.ll b/llvm/test/CodeGen/AArch64/arm64-long-shift.ll
index a0072a6e1963059..ec72f669cc61f05 100644
--- a/llvm/test/CodeGen/AArch64/arm64-long-shift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-long-shift.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=arm64-eabi -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -mcpu=cyclone -aarch64-enable-sink-fold=true | FileCheck %s
define i128 @shl(i128 %r, i128 %s) nounwind readnone {
; CHECK-LABEL: shl:
@@ -26,8 +26,8 @@ define i128 @shl_mask(i128 %r, i128 %s) nounwind readnone {
; CHECK-NEXT: and x10, x2, #0x3f
; CHECK-NEXT: eor x10, x10, #0x3f
; CHECK-NEXT: lsr x9, x9, x10
-; CHECK-NEXT: orr x1, x8, x9
; CHECK-NEXT: lsl x0, x0, x2
+; CHECK-NEXT: orr x1, x8, x9
; CHECK-NEXT: ret
%mask = and i128 %s, 63
%shl = shl i128 %r, %mask
@@ -60,8 +60,8 @@ define i128 @ashr_mask(i128 %r, i128 %s) nounwind readnone {
; CHECK-NEXT: and x10, x2, #0x3f
; CHECK-NEXT: eor x10, x10, #0x3f
; CHECK-NEXT: lsl x9, x9, x10
-; CHECK-NEXT: orr x0, x9, x8
; CHECK-NEXT: asr x1, x1, x2
+; CHECK-NEXT: orr x0, x9, x8
; CHECK-NEXT: ret
%mask = and i128 %s, 63
%shr = ashr i128 %r, %mask
@@ -93,8 +93,8 @@ define i128 @lshr_mask(i128 %r, i128 %s) nounwind readnone {
; CHECK-NEXT: and x10, x2, #0x3f
; CHECK-NEXT: eor x10, x10, #0x3f
; CHECK-NEXT: lsl x9, x9, x10
-; CHECK-NEXT: orr x0, x9, x8
; CHECK-NEXT: lsr x1, x1, x2
+; CHECK-NEXT: orr x0, x9, x8
; CHECK-NEXT: ret
%mask = and i128 %s, 63
%shr = lshr i128 %r, %mask
diff --git a/llvm/test/CodeGen/AArch64/arm64-stp.ll b/llvm/test/CodeGen/AArch64/arm64-stp.ll
index b953e7151a83cee..a393fcb17fcb2bc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-stp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-stp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-enable-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-enable-stp-suppress=false -verify-machineinstrs -mcpu=cyclone -aarch64-enable-sink-fold=true | FileCheck %s
define void @stp_int(i32 %a, i32 %b, ptr nocapture %p) nounwind {
; CHECK-LABEL: stp_int:
@@ -182,9 +182,8 @@ define i32 @stp_int_rar_hazard(i32 %a, i32 %b, ptr nocapture %p) nounwind {
; CHECK-LABEL: stp_int_rar_hazard:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr w8, [x2, #8]
-; CHECK-NEXT: add w8, w8, w1
; CHECK-NEXT: stp w0, w1, [x2]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: add w0, w8, w1
; CHECK-NEXT: ret
store i32 %a, ptr %p, align 4
%ld.ptr = getelementptr inbounds i32, ptr %p, i64 2
@@ -200,8 +199,8 @@ define i32 @stp_int_rar_hazard_after(i32 %w0, i32 %a, i32 %b, ptr nocapture %p)
; CHECK-LABEL: stp_int_rar_hazard_after:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr w8, [x3, #4]
-; CHECK-NEXT: add w0, w8, w2
; CHECK-NEXT: stp w1, w2, [x3]
+; CHECK-NEXT: add w0, w8, w2
; CHECK-NEXT: ret
store i32 %a, ptr %p, align 4
%ld.ptr = getelementptr inbounds i32, ptr %p, i64 1
diff --git a/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll b/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll
index fee4fd839554c11..ad073d96a148300 100644
--- a/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64_32-apple-ios %s -aarch64-enable-sink-fold=true -o - | FileCheck %s
; If %base < 96 then the sum will not wrap (in an unsigned sense), but "ldr w0,
; [x0, #-96]" would.
@@ -42,10 +42,9 @@ define i8 @test_valid_wrap_optimizable1(ptr %base, i32 %offset) {
define i8 @test_valid_wrap_optimizable2(ptr %base, i32 %offset) {
; CHECK-LABEL: test_valid_wrap_optimizable2:
; CHECK: ; %bb.0:
+; CHECK-NEXT: mov w8, #-100 ; =0xffffff9c
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sxtw x8, w1
-; CHECK-NEXT: mov w9, #-100 ; =0xffffff9c
-; CHECK-NEXT: ldrb w0, [x8, x9]
+; CHECK-NEXT: ldrb w0, [x8, w1, sxtw]
; CHECK-NEXT: ret
%newaddr = getelementptr inbounds i8, ptr inttoptr(i32 -100 to ptr), i32 %offset
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
index 8b9e66c166498a4..70f3b5cc488ea8b 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -mattr=+outline-atomics < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
-; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s --check-prefix=CHECK-REG
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -aarch64-enable-sink-fold=true < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -mattr=+outline-atomics -aarch64-enable-sink-fold=true < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -aarch64-enable-sink-fold=true < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefix=CHECK-REG
; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
; (i.e. reusing a register for status & data in store exclusive).
@@ -1713,9 +1713,9 @@ define dso_local i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i8:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var8
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var8
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr1_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -1735,9 +1735,9 @@ define dso_local i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i16:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var16
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var16
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr2_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -1757,9 +1757,9 @@ define dso_local i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -1779,9 +1779,9 @@ define dso_local i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -1965,9 +1965,9 @@ define dso_local void @test_atomic_load_and_i32_noret(i32 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_noret:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -1987,9 +1987,9 @@ define dso_local void @test_atomic_load_and_i64_noret(i64 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_noret:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2629,9 +2629,9 @@ define dso_local i8 @test_atomic_load_and_i8_acq_rel(i8 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i8_acq_rel:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var8
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var8
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr1_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2651,9 +2651,9 @@ define dso_local i16 @test_atomic_load_and_i16_acq_rel(i16 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i16_acq_rel:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var16
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var16
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr2_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2673,9 +2673,9 @@ define dso_local i32 @test_atomic_load_and_i32_acq_rel(i32 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_acq_rel:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2695,9 +2695,9 @@ define dso_local i64 @test_atomic_load_and_i64_acq_rel(i64 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_acq_rel:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2717,9 +2717,9 @@ define dso_local void @test_atomic_load_and_i32_noret_acq_rel(i32 %offset) nounw
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_noret_acq_rel:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2739,9 +2739,9 @@ define dso_local void @test_atomic_load_and_i64_noret_acq_rel(i64 %offset) nounw
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_noret_acq_rel:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2761,9 +2761,9 @@ define dso_local i8 @test_atomic_load_and_i8_acquire(i8 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i8_acquire:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var8
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var8
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr1_acq
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2783,9 +2783,9 @@ define dso_local i16 @test_atomic_load_and_i16_acquire(i16 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i16_acquire:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var16
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var16
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr2_acq
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2805,9 +2805,9 @@ define dso_local i32 @test_atomic_load_and_i32_acquire(i32 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_acquire:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_acq
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2827,9 +2827,9 @@ define dso_local i64 @test_atomic_load_and_i64_acquire(i64 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_acquire:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_acq
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2849,9 +2849,9 @@ define dso_local void @test_atomic_load_and_i32_noret_acquire(i32 %offset) nounw
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_noret_acquire:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_acq
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2871,9 +2871,9 @@ define dso_local void @test_atomic_load_and_i64_noret_acquire(i64 %offset) nounw
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_noret_acquire:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_acq
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2893,9 +2893,9 @@ define dso_local i8 @test_atomic_load_and_i8_monotonic(i8 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i8_monotonic:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var8
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var8
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr1_relax
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2915,9 +2915,9 @@ define dso_local i16 @test_atomic_load_and_i16_monotonic(i16 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i16_monotonic:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var16
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var16
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr2_relax
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2937,9 +2937,9 @@ define dso_local i32 @test_atomic_load_and_i32_monotonic(i32 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_monotonic:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_relax
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2959,9 +2959,9 @@ define dso_local i64 @test_atomic_load_and_i64_monotonic(i64 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_monotonic:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_relax
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -2981,9 +2981,9 @@ define dso_local void @test_atomic_load_and_i32_noret_monotonic(i32 %offset) nou
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_noret_monotonic:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_relax
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3003,9 +3003,9 @@ define dso_local void @test_atomic_load_and_i64_noret_monotonic(i64 %offset) nou
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_noret_monotonic:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_relax
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3025,9 +3025,9 @@ define dso_local i8 @test_atomic_load_and_i8_release(i8 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i8_release:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var8
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var8
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr1_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3047,9 +3047,9 @@ define dso_local i16 @test_atomic_load_and_i16_release(i16 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i16_release:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var16
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var16
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr2_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3069,9 +3069,9 @@ define dso_local i32 @test_atomic_load_and_i32_release(i32 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_release:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3091,9 +3091,9 @@ define dso_local i64 @test_atomic_load_and_i64_release(i64 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_release:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3113,9 +3113,9 @@ define dso_local void @test_atomic_load_and_i32_noret_release(i32 %offset) nounw
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_noret_release:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3135,9 +3135,9 @@ define dso_local void @test_atomic_load_and_i64_noret_release(i64 %offset) nounw
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_noret_release:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3157,9 +3157,9 @@ define dso_local i8 @test_atomic_load_and_i8_seq_cst(i8 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i8_seq_cst:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var8
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var8
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr1_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3179,9 +3179,9 @@ define dso_local i16 @test_atomic_load_and_i16_seq_cst(i16 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i16_seq_cst:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var16
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var16
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr2_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3201,9 +3201,9 @@ define dso_local i32 @test_atomic_load_and_i32_seq_cst(i32 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_seq_cst:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3223,9 +3223,9 @@ define dso_local i64 @test_atomic_load_and_i64_seq_cst(i64 %offset) nounwind {
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_seq_cst:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3245,9 +3245,9 @@ define dso_local void @test_atomic_load_and_i32_noret_seq_cst(i32 %offset) nounw
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i32_noret_seq_cst:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: adrp x1, var32
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE-ATOMICS-NEXT: mvn w0, w0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr4_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
@@ -3267,9 +3267,9 @@ define dso_local void @test_atomic_load_and_i64_noret_seq_cst(i64 %offset) nounw
; OUTLINE-ATOMICS-LABEL: test_atomic_load_and_i64_noret_seq_cst:
; OUTLINE-ATOMICS: // %bb.0:
; OUTLINE-ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: adrp x1, var64
; OUTLINE-ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE-ATOMICS-NEXT: mvn x0, x0
; OUTLINE-ATOMICS-NEXT: bl __aarch64_ldclr8_acq_rel
; OUTLINE-ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE-ATOMICS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops.ll b/llvm/test/CodeGen/AArch64/atomic-ops.ll
index f198affdf22a882..679065529090f0c 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,INLINE_ATOMICS
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefixes=CHECK,OUTLINE_ATOMICS
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefixes=CHECK,INLINE_ATOMICS
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefixes=CHECK,OUTLINE_ATOMICS
@var8 = dso_local global i8 0
@var16 = dso_local global i16 0
@@ -245,9 +245,9 @@ define dso_local i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
; OUTLINE_ATOMICS-LABEL: test_atomic_load_and_i8:
; OUTLINE_ATOMICS: // %bb.0:
; OUTLINE_ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE_ATOMICS-NEXT: mvn w0, w0
; OUTLINE_ATOMICS-NEXT: adrp x1, var8
; OUTLINE_ATOMICS-NEXT: add x1, x1, :lo12:var8
+; OUTLINE_ATOMICS-NEXT: mvn w0, w0
; OUTLINE_ATOMICS-NEXT: bl __aarch64_ldclr1_rel
; OUTLINE_ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE_ATOMICS-NEXT: ret
@@ -273,9 +273,9 @@ define dso_local i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
; OUTLINE_ATOMICS-LABEL: test_atomic_load_and_i16:
; OUTLINE_ATOMICS: // %bb.0:
; OUTLINE_ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE_ATOMICS-NEXT: mvn w0, w0
; OUTLINE_ATOMICS-NEXT: adrp x1, var16
; OUTLINE_ATOMICS-NEXT: add x1, x1, :lo12:var16
+; OUTLINE_ATOMICS-NEXT: mvn w0, w0
; OUTLINE_ATOMICS-NEXT: bl __aarch64_ldclr2_relax
; OUTLINE_ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE_ATOMICS-NEXT: ret
@@ -301,9 +301,9 @@ define dso_local i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
; OUTLINE_ATOMICS-LABEL: test_atomic_load_and_i32:
; OUTLINE_ATOMICS: // %bb.0:
; OUTLINE_ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE_ATOMICS-NEXT: mvn w0, w0
; OUTLINE_ATOMICS-NEXT: adrp x1, var32
; OUTLINE_ATOMICS-NEXT: add x1, x1, :lo12:var32
+; OUTLINE_ATOMICS-NEXT: mvn w0, w0
; OUTLINE_ATOMICS-NEXT: bl __aarch64_ldclr4_acq_rel
; OUTLINE_ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE_ATOMICS-NEXT: ret
@@ -329,9 +329,9 @@ define dso_local i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
; OUTLINE_ATOMICS-LABEL: test_atomic_load_and_i64:
; OUTLINE_ATOMICS: // %bb.0:
; OUTLINE_ATOMICS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; OUTLINE_ATOMICS-NEXT: mvn x0, x0
; OUTLINE_ATOMICS-NEXT: adrp x1, var64
; OUTLINE_ATOMICS-NEXT: add x1, x1, :lo12:var64
+; OUTLINE_ATOMICS-NEXT: mvn x0, x0
; OUTLINE_ATOMICS-NEXT: bl __aarch64_ldclr8_acq
; OUTLINE_ATOMICS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; OUTLINE_ATOMICS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index bf0c69e291c8fbc..d16b5786a996557 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64-apple-iphoneos -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-iphoneos -aarch64-enable-sink-fold=true -o - %s | FileCheck %s
define i3 @sign_i3(i3 %a) {
; CHECK-LABEL: sign_i3:
@@ -244,18 +244,18 @@ define <4 x i65> @sign_4xi65(<4 x i65> %a) {
; CHECK-LABEL: sign_4xi65:
; CHECK: // %bb.0:
; CHECK-NEXT: sbfx x8, x1, #0, #1
-; CHECK-NEXT: sbfx x9, x3, #0, #1
-; CHECK-NEXT: sbfx x10, x7, #0, #1
+; CHECK-NEXT: sbfx x9, x5, #0, #1
+; CHECK-NEXT: sbfx x10, x3, #0, #1
; CHECK-NEXT: lsr x1, x8, #63
; CHECK-NEXT: orr x8, x8, #0x1
-; CHECK-NEXT: lsr x3, x9, #63
+; CHECK-NEXT: lsr x3, x10, #63
; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: sbfx x8, x5, #0, #1
-; CHECK-NEXT: lsr x7, x10, #63
-; CHECK-NEXT: orr x2, x9, #0x1
-; CHECK-NEXT: orr x6, x10, #0x1
-; CHECK-NEXT: lsr x5, x8, #63
-; CHECK-NEXT: orr x4, x8, #0x1
+; CHECK-NEXT: sbfx x8, x7, #0, #1
+; CHECK-NEXT: lsr x5, x9, #63
+; CHECK-NEXT: orr x2, x10, #0x1
+; CHECK-NEXT: orr x4, x9, #0x1
+; CHECK-NEXT: lsr x7, x8, #63
+; CHECK-NEXT: orr x6, x8, #0x1
; CHECK-NEXT: mov v0.d[1], x1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
index b9da2b76816a99c..186d191444feb6c 100644
--- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
+++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-apple-ios7.0 -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-apple-ios7.0 -mattr=+outline-atomics -o - %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
+; RUN: llc -mtriple=aarch64-apple-ios7.0 -aarch64-enable-sink-fold=true -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-ios7.0 -mattr=+outline-atomics -aarch64-enable-sink-fold=true -o - %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
define i32 @test_return(ptr %p, i32 %oldval, i32 %newval) {
; CHECK-LABEL: test_return:
@@ -207,8 +207,7 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) {
; CHECK-NEXT: b.eq LBB3_6
; CHECK-NEXT: ; %bb.8: ; %if.then
; CHECK-NEXT: ; in Loop: Header=BB3_6 Depth=1
-; CHECK-NEXT: sxtw x8, w22
-; CHECK-NEXT: str w9, [x19, x8, lsl #2]
+; CHECK-NEXT: str w9, [x19, w22, sxtw #2]
; CHECK-NEXT: bl _foo
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: b LBB3_6
@@ -250,8 +249,7 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) {
; OUTLINE-ATOMICS-NEXT: b.eq LBB3_1
; OUTLINE-ATOMICS-NEXT: ; %bb.3: ; %if.then
; OUTLINE-ATOMICS-NEXT: ; in Loop: Header=BB3_1 Depth=1
-; OUTLINE-ATOMICS-NEXT: sxtw x8, w22
-; OUTLINE-ATOMICS-NEXT: str w9, [x19, x8, lsl #2]
+; OUTLINE-ATOMICS-NEXT: str w9, [x19, w22, sxtw #2]
; OUTLINE-ATOMICS-NEXT: bl _foo
; OUTLINE-ATOMICS-NEXT: mov w8, wzr
; OUTLINE-ATOMICS-NEXT: b LBB3_1
diff --git a/llvm/test/CodeGen/AArch64/loop-sink.mir b/llvm/test/CodeGen/AArch64/loop-sink.mir
index b571a3e95d27de3..36d39ffbadc29a4 100644
--- a/llvm/test/CodeGen/AArch64/loop-sink.mir
+++ b/llvm/test/CodeGen/AArch64/loop-sink.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple aarch64 -run-pass=machine-sink -sink-insts-to-avoid-spills %s -o - 2>&1 | FileCheck %s
+# RUN: llc -mtriple aarch64 -run-pass=machine-sink -sink-insts-to-avoid-spills -aarch64-enable-sink-fold=true %s -o - 2>&1 | FileCheck %s
--- |
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
@@ -328,28 +328,18 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 4, 0
+ ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri [[COPY1]], 1, 0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64all = COPY [[ADDXri]]
- ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 8, 0
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64all = COPY [[ADDXri1]]
- ; CHECK-NEXT: [[ADDXri2:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 12, 0
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64all = COPY [[ADDXri2]]
- ; CHECK-NEXT: [[ADDXri3:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 16, 0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64all = COPY [[ADDXri3]]
- ; CHECK-NEXT: [[ADDXri4:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 20, 0
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64all = COPY [[ADDXri4]]
- ; CHECK-NEXT: [[ADDXri5:%[0-9]+]]:gpr64sp = ADDXri [[COPY1]], 1, 0
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64all = COPY [[ADDXri5]]
; CHECK-NEXT: [[MOVaddrJT:%[0-9]+]]:gpr64common = MOVaddrJT target-flags(aarch64-page) %jump-table.0, target-flags(aarch64-pageoff, aarch64-nc) %jump-table.0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1..backedge:
; CHECK-NEXT: successors: %bb.9(0x09249249), %bb.2(0x76db6db7)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr64sp = PHI [[COPY7]], %bb.0, %7, %bb.9
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr64sp = PHI [[COPY2]], %bb.0, %7, %bb.9
; CHECK-NEXT: [[LDRBBui:%[0-9]+]]:gpr32 = LDRBBui [[PHI]], 0 :: (load (s8) from %ir.lsr.iv)
; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[LDRBBui]], %subreg.sub_32
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr32sp = COPY [[SUBREG_TO_REG]].sub_32
- ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri killed [[COPY8]], 50, 0, implicit-def $nzcv
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32sp = COPY [[SUBREG_TO_REG]].sub_32
+ ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri killed [[COPY3]], 50, 0, implicit-def $nzcv
; CHECK-NEXT: Bcc 8, %bb.9, implicit $nzcv
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2..backedge:
@@ -371,7 +361,7 @@ body: |
; CHECK-NEXT: successors: %bb.9(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-NEXT: $x0 = COPY [[COPY2]]
+ ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 4, 0
; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: B %bb.9
@@ -380,7 +370,7 @@ body: |
; CHECK-NEXT: successors: %bb.9(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-NEXT: $x0 = COPY [[COPY3]]
+ ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 8, 0
; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: B %bb.9
@@ -389,7 +379,7 @@ body: |
; CHECK-NEXT: successors: %bb.9(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-NEXT: $x0 = COPY [[COPY4]]
+ ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 12, 0
; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: B %bb.9
@@ -398,7 +388,7 @@ body: |
; CHECK-NEXT: successors: %bb.9(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-NEXT: $x0 = COPY [[COPY5]]
+ ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 16, 0
; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: B %bb.9
@@ -407,15 +397,15 @@ body: |
; CHECK-NEXT: successors: %bb.9(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-NEXT: $x0 = COPY [[COPY6]]
+ ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 20, 0
; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.9..backedge.backedge:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[ADDXri6:%[0-9]+]]:gpr64sp = ADDXri [[PHI]], 1, 0
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64all = COPY [[ADDXri6]]
+ ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64sp = ADDXri [[PHI]], 1, 0
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64all = COPY [[ADDXri1]]
; CHECK-NEXT: B %bb.1
bb.0 (%ir-block.bb):
successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index ad46d32e4bf6323..0c2e3916426f577 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mattr=+sve < %s -mtriple aarch64-apple-darwin | FileCheck %s
-; RUN: llc --mattr=+sve < %s -mtriple aarch64_be-unknown-unknown | FileCheck --check-prefix CHECK-BE %s
+; RUN: llc --mattr=+sve -aarch64-enable-sink-fold=true < %s -mtriple aarch64-apple-darwin | FileCheck %s
+; RUN: llc --mattr=+sve -aarch64-enable-sink-fold=true < %s -mtriple aarch64_be-unknown-unknown | FileCheck --check-prefix CHECK-BE %s
define <4 x double> @test_ldnp_v4f64(ptr %A) {
; CHECK-LABEL: test_ldnp_v4f64:
@@ -527,14 +527,14 @@ define <4 x i65> @test_ldnp_v4i65(ptr %A) {
define <4 x i63> @test_ldnp_v4i63(ptr %A) {
; CHECK-LABEL: test_ldnp_v4i63:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ldp x8, x9, [x0]
-; CHECK-NEXT: ldp x10, x11, [x0, #16]
-; CHECK-NEXT: extr x12, x9, x8, #63
-; CHECK-NEXT: and x0, x8, #0x7fffffffffffffff
-; CHECK-NEXT: extr x9, x10, x9, #62
-; CHECK-NEXT: extr x3, x11, x10, #61
-; CHECK-NEXT: and x1, x12, #0x7fffffffffffffff
-; CHECK-NEXT: and x2, x9, #0x7fffffffffffffff
+; CHECK-NEXT: ldp x8, x9, [x0, #16]
+; CHECK-NEXT: ldp x10, x11, [x0]
+; CHECK-NEXT: extr x3, x9, x8, #61
+; CHECK-NEXT: extr x9, x11, x10, #63
+; CHECK-NEXT: extr x8, x8, x11, #62
+; CHECK-NEXT: and x0, x10, #0x7fffffffffffffff
+; CHECK-NEXT: and x1, x9, #0x7fffffffffffffff
+; CHECK-NEXT: and x2, x8, #0x7fffffffffffffff
; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_ldnp_v4i63:
diff --git a/llvm/test/CodeGen/AArch64/optimize-imm.ll b/llvm/test/CodeGen/AArch64/optimize-imm.ll
index d06295d3b01547f..3010161bb0e6903 100644
--- a/llvm/test/CodeGen/AArch64/optimize-imm.ll
+++ b/llvm/test/CodeGen/AArch64/optimize-imm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -o - %s -mtriple=aarch64-- | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-- -aarch64-enable-sink-fold=true | FileCheck %s
define void @and1(i32 %a, ptr nocapture %p) {
; CHECK-LABEL: and1:
@@ -79,9 +79,9 @@ define i64 @PR33100(i64 %arg) {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov w8, #129 // =0x81
+; CHECK-NEXT: mov w9, #8 // =0x8
; CHECK-NEXT: eor x0, x0, x8
-; CHECK-NEXT: mov w8, #8 // =0x8
-; CHECK-NEXT: str x8, [sp, #8]
+; CHECK-NEXT: str x9, [sp, #8]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
index d86af564f2622bc..b2c7397bf575d8e 100644
--- a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
+++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
@@ -1,12 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc %s --mtriple aarch64 -verify-machineinstrs -o - | FileCheck %s
+; RUN: llc %s --mtriple aarch64 -verify-machineinstrs -aarch64-enable-sink-fold=true -o - | FileCheck %s
define dso_local void @jsimd_idct_ifast_neon_intrinsic(ptr nocapture readonly %dct_table, ptr nocapture readonly %coef_block, ptr nocapture readonly %output_buf, i32 %output_col) local_unnamed_addr #0 {
; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr q0, [x1, #32]
; CHECK-NEXT: ldr q1, [x1, #96]
-; CHECK-NEXT: mov w9, w3
; CHECK-NEXT: ldr q2, [x0, #32]
; CHECK-NEXT: ldr q3, [x0, #96]
; CHECK-NEXT: ldr x8, [x2, #48]
@@ -14,9 +13,9 @@ define dso_local void @jsimd_idct_ifast_neon_intrinsic(ptr nocapture readonly %d
; CHECK-NEXT: mul v1.8h, v3.8h, v1.8h
; CHECK-NEXT: add v2.8h, v0.8h, v1.8h
; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: str q2, [x8, x9]
+; CHECK-NEXT: str q2, [x8, w3, uxtw]
; CHECK-NEXT: ldr x8, [x2, #56]
-; CHECK-NEXT: str q0, [x8, x9]
+; CHECK-NEXT: str q0, [x8, w3, uxtw]
; CHECK-NEXT: ret
entry:
%add.ptr5 = getelementptr inbounds i16, ptr %coef_block, i64 16
diff --git a/llvm/test/CodeGen/AArch64/rand.ll b/llvm/test/CodeGen/AArch64/rand.ll
index 5ba356e86cba275..706774d83b18701 100644
--- a/llvm/test/CodeGen/AArch64/rand.ll
+++ b/llvm/test/CodeGen/AArch64/rand.ll
@@ -1,15 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 -mattr=+v8.5a,+rand %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+v8.5a,+rand -aarch64-enable-sink-fold=true %s -o - | FileCheck %s
define i32 @rndr(ptr %__addr) {
; CHECK-LABEL: rndr:
; CHECK: // %bb.0:
-; CHECK-NEXT: mrs x10, RNDR
-; CHECK-NEXT: mov x9, x0
-; CHECK-NEXT: cset w8, eq
-; CHECK-NEXT: str x10, [x9]
-; CHECK-NEXT: and w8, w8, #0x1
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: mrs x9, RNDR
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: cset w10, eq
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: and w0, w10, #0x1
; CHECK-NEXT: ret
%1 = tail call { i64, i1 } @llvm.aarch64.rndr()
%2 = extractvalue { i64, i1 } %1, 0
@@ -23,12 +22,11 @@ define i32 @rndr(ptr %__addr) {
define i32 @rndrrs(ptr %__addr) {
; CHECK-LABEL: rndrrs:
; CHECK: // %bb.0:
-; CHECK-NEXT: mrs x10, RNDRRS
-; CHECK-NEXT: mov x9, x0
-; CHECK-NEXT: cset w8, eq
-; CHECK-NEXT: str x10, [x9]
-; CHECK-NEXT: and w8, w8, #0x1
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: mrs x9, RNDRRS
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: cset w10, eq
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: and w0, w10, #0x1
; CHECK-NEXT: ret
%1 = tail call { i64, i1 } @llvm.aarch64.rndrrs()
%2 = extractvalue { i64, i1 } %1, 0
diff --git a/llvm/test/CodeGen/AArch64/shrink-constant-multiple-users.ll b/llvm/test/CodeGen/AArch64/shrink-constant-multiple-users.ll
index d787d3696c11013..f43707426ad17b1 100644
--- a/llvm/test/CodeGen/AArch64/shrink-constant-multiple-users.ll
+++ b/llvm/test/CodeGen/AArch64/shrink-constant-multiple-users.ll
@@ -1,11 +1,12 @@
-; RUN: llc -mtriple arm64-ios- %s -o - | FileCheck %s
+; RUN: llc -mtriple arm64-ios- -aarch64-enable-sink-fold=true %s -o - | FileCheck %s
; Check the -8 constant is shrunk if there are multiple users of the AND instruction.
; CHECK-LABEL: _test:
-; CHECK: and x0, x0, #0xfffffff8
-; CHECK-NEXT: add x19, x0, #10
+; CHECK: and x19, x0, #0xfffffff8
+; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: bl _user
+; CHECK: add x0, x19, #10
define i64 @test(i32 %a) {
%ext = zext i32 %a to i64
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 88ca69e2f5ebe70..632fdb391053121 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -aarch64-enable-sink-fold=true < %s | FileCheck %s
target triple = "aarch64-linux"
declare i32 @use(...)
@@ -7,16 +7,15 @@ declare i32 @use(...)
define i32 @f0(i1 %c1, ptr %p) nounwind {
; CHECK-LABEL: f0:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: add x0, x1, #8
-; CHECK-NEXT: tbz w8, #0, .LBB0_2
+; CHECK-NEXT: tbz w0, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: add x0, x1, #8
; CHECK-NEXT: bl use
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_2: // %if.else
-; CHECK-NEXT: ldr w0, [x0]
+; CHECK-NEXT: ldur w0, [x1, #8]
; CHECK-NEXT: ret
entry:
%a = getelementptr i32, ptr %p, i32 2
@@ -38,16 +37,15 @@ exit:
define i32 @f1(i1 %c1, ptr %p, i64 %i) nounwind {
; CHECK-LABEL: f1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: add x0, x1, x2
-; CHECK-NEXT: tbz w8, #0, .LBB1_2
+; CHECK-NEXT: tbz w0, #0, .LBB1_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: add x0, x1, x2
; CHECK-NEXT: bl use
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_2: // %if.else
-; CHECK-NEXT: ldr w0, [x0]
+; CHECK-NEXT: ldr w0, [x1, x2]
; CHECK-NEXT: ret
entry:
%a = getelementptr i8, ptr %p, i64 %i
@@ -102,19 +100,18 @@ exit:
}
; Address calculation cheap enough on some cores.
-define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast" {
+define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" {
; CHECK-LABEL: f3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: add x0, x1, x2, lsl #2
-; CHECK-NEXT: tbz w8, #0, .LBB3_2
+; CHECK-NEXT: tbz w0, #0, .LBB3_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: add x0, x1, x2, lsl #2
; CHECK-NEXT: bl use
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB3_2: // %if.else
-; CHECK-NEXT: ldr w0, [x0]
+; CHECK-NEXT: ldr w0, [x1, x2, lsl #2]
; CHECK-NEXT: ret
entry:
%a = getelementptr i32, ptr %p, i64 %i
@@ -139,29 +136,27 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-
; CHECK-NEXT: cmp x1, #1
; CHECK-NEXT: b.lt .LBB4_9
; CHECK-NEXT: // %bb.1: // %LI.preheader
-; CHECK-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: mov x23, xzr
-; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
-; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: b .LBB4_3
; CHECK-NEXT: .LBB4_2: // %LI.latch
; CHECK-NEXT: // in Loop: Header=BB4_3 Depth=1
-; CHECK-NEXT: cmp x23, x19
-; CHECK-NEXT: mov x23, x24
+; CHECK-NEXT: cmp x22, x19
+; CHECK-NEXT: mov x22, x23
; CHECK-NEXT: b.ge .LBB4_8
; CHECK-NEXT: .LBB4_3: // %LI
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB4_6 Depth 2
-; CHECK-NEXT: add x22, x20, x23, lsl #2
; CHECK-NEXT: mov x21, xzr
-; CHECK-NEXT: add x24, x23, #1
+; CHECK-NEXT: add x23, x22, #1
; CHECK-NEXT: b .LBB4_6
; CHECK-NEXT: .LBB4_4: // %if.else
; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2
-; CHECK-NEXT: ldr w0, [x22]
+; CHECK-NEXT: ldr w0, [x20, x22, lsl #2]
; CHECK-NEXT: .LBB4_5: // %LJ.latch
; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2
; CHECK-NEXT: add x8, x21, #1
@@ -177,15 +172,14 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-
; CHECK-NEXT: tbz w8, #31, .LBB4_4
; CHECK-NEXT: // %bb.7: // %if.then
; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2
-; CHECK-NEXT: mov x0, x22
+; CHECK-NEXT: add x0, x20, x22, lsl #2
; CHECK-NEXT: mov x1, x21
; CHECK-NEXT: bl use
; CHECK-NEXT: b .LBB4_5
; CHECK-NEXT: .LBB4_8:
-; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload
; CHECK-NEXT: .LBB4_9: // %exit
; CHECK-NEXT: ret
entry:
@@ -238,17 +232,16 @@ define void @f5(ptr %a, i32 %n, i32 %k) nounwind {
; CHECK-NEXT: // %bb.1: // %L.preheader
; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-NEXT: mov w8, #12 // =0xc
-; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: add x21, x0, #8
-; CHECK-NEXT: smaddl x8, w2, w8, x0
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: smaddl x20, w2, w8, x0
+; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: add x21, x0, #8
; CHECK-NEXT: mov w22, #-1 // =0xffffffff
-; CHECK-NEXT: add x20, x8, #4
; CHECK-NEXT: b .LBB5_4
; CHECK-NEXT: .LBB5_2: // %if.else
; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1
-; CHECK-NEXT: ldr w0, [x20]
+; CHECK-NEXT: ldur w0, [x20, #4]
; CHECK-NEXT: .LBB5_3: // %L.latch
; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1
; CHECK-NEXT: add w22, w22, #1
@@ -261,8 +254,8 @@ define void @f5(ptr %a, i32 %n, i32 %k) nounwind {
; CHECK-NEXT: tbz w8, #31, .LBB5_2
; CHECK-NEXT: // %bb.5: // %if.then
; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1
+; CHECK-NEXT: add x0, x20, #4
; CHECK-NEXT: add w1, w22, #1
-; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: bl use
; CHECK-NEXT: b .LBB5_3
; CHECK-NEXT: .LBB5_6:
@@ -306,14 +299,13 @@ define i32 @f6(i1 %c, ptr %a, i32 %i) {
; CHECK-LABEL: f6:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT: sxtw x8, w2
; CHECK-NEXT: tbz w0, #0, .LBB6_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: str wzr, [x1, x8, lsl #2]
+; CHECK-NEXT: str wzr, [x1, w2, sxtw #2]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB6_2: // %if.else
-; CHECK-NEXT: ldr w0, [x1, x8, lsl #2]
+; CHECK-NEXT: ldr w0, [x1, w2, sxtw #2]
; CHECK-NEXT: ret
entry:
%j = sext i32 %i to i64
@@ -337,14 +329,13 @@ exit:
define i8 @f7(i1 %c, ptr %a, i32 %i) {
; CHECK-LABEL: f7:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, w2
; CHECK-NEXT: tbz w0, #0, .LBB7_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: strb wzr, [x1, x8]
+; CHECK-NEXT: strb wzr, [x1, w2, uxtw]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB7_2: // %if.else
-; CHECK-NEXT: ldrb w0, [x1, x8]
+; CHECK-NEXT: ldrb w0, [x1, w2, uxtw]
; CHECK-NEXT: ret
entry:
%j = zext i32 %i to i64
@@ -368,14 +359,13 @@ exit:
define i32 @f8(i1 %c, ptr %a, i32 %i) {
; CHECK-LABEL: f8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x1, w2, sxtw #2
; CHECK-NEXT: tbz w0, #0, .LBB8_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: str wzr, [x8]
+; CHECK-NEXT: str wzr, [x1, w2, sxtw #2]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB8_2: // %if.else
-; CHECK-NEXT: ldr w0, [x8]
+; CHECK-NEXT: ldr w0, [x1, w2, sxtw #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i32, ptr %a, i32 %i
@@ -397,14 +387,13 @@ exit:
define i64 @f9(i1 %c, ptr %a, i32 %i) {
; CHECK-LABEL: f9:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, w2
; CHECK-NEXT: tbz w0, #0, .LBB9_2
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: mov x0, xzr
-; CHECK-NEXT: str xzr, [x1, x8, lsl #3]
+; CHECK-NEXT: str xzr, [x1, w2, uxtw #3]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB9_2: // %if.else
-; CHECK-NEXT: ldr x0, [x1, x8, lsl #3]
+; CHECK-NEXT: ldr x0, [x1, w2, uxtw #3]
; CHECK-NEXT: ret
entry:
%j = zext i32 %i to i64
diff --git a/llvm/test/CodeGen/AArch64/swift-async-win.ll b/llvm/test/CodeGen/AArch64/swift-async-win.ll
index 76c8583ffc165fc..94308979b07f88f 100644
--- a/llvm/test/CodeGen/AArch64/swift-async-win.ll
+++ b/llvm/test/CodeGen/AArch64/swift-async-win.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple aarch64-unknown-windows -swift-async-fp=never -filetype asm -o - %s | FileCheck %s
+; RUN: llc -mtriple aarch64-unknown-windows -swift-async-fp=never -aarch64-enable-sink-fold=true -filetype asm -o - %s | FileCheck %s
; ModuleID = '_Concurrency.ll'
source_filename = "_Concurrency.ll"
@@ -22,14 +22,13 @@ define hidden swifttailcc void @"$ss23withCheckedContinuation8function_xSS_yScCy
; CHECK-NEXT: stp x30, x29, [sp, #24] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #24
; CHECK-NEXT: str x19, [sp, #40] // 8-byte Folded Spill
-; CHECK-NEXT: sub x8, x29, #8
; CHECK-NEXT: adrp x19, __imp_swift_task_dealloc
; CHECK-NEXT: str xzr, [sp, #16]
-; CHECK-NEXT: ldr x9, [x0]
-; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: stur x8, [x29, #-8]
; CHECK-NEXT: ldr x20, [x0]
-; CHECK-NEXT: ldp x22, x0, [x9, #16]
-; CHECK-NEXT: str x20, [x8]
+; CHECK-NEXT: ldp x22, x0, [x8, #16]
+; CHECK-NEXT: stur x20, [x29, #-8]
; CHECK-NEXT: ldr x19, [x19, :lo12:__imp_swift_task_dealloc]
; CHECK-NEXT: blr x19
; CHECK-NEXT: mov x0, x22
diff --git a/llvm/test/CodeGen/AArch64/swift-async.ll b/llvm/test/CodeGen/AArch64/swift-async.ll
index 55b347d674fe820..4a3bf15b666b3fb 100644
--- a/llvm/test/CodeGen/AArch64/swift-async.ll
+++ b/llvm/test/CodeGen/AArch64/swift-async.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=arm64-apple-ios15 %s -o - | FileCheck %s --check-prefixes=CHECK-NOAUTH,CHECK
-; RUN: llc -mtriple=arm64-apple-ios15 -mcpu=apple-a13 %s -o - | FileCheck %s --check-prefixes=CHECK-NOAUTH,CHECK
-; RUN: llc -mtriple=arm64e-apple-ios15 %s -o - | FileCheck %s --check-prefixes=CHECK-AUTH,CHECK
+; RUN: llc -mtriple=arm64-apple-ios15 -aarch64-enable-sink-fold=true %s -o - | FileCheck %s --check-prefixes=CHECK-NOAUTH,CHECK
+; RUN: llc -mtriple=arm64-apple-ios15 -aarch64-enable-sink-fold=true -mcpu=apple-a13 %s -o - | FileCheck %s --check-prefixes=CHECK-NOAUTH,CHECK
+; RUN: llc -mtriple=arm64e-apple-ios15 -aarch64-enable-sink-fold=true %s -o - | FileCheck %s --check-prefixes=CHECK-AUTH,CHECK
; Important details in prologue:
; * x22 is stored just below x29
@@ -120,8 +120,7 @@ define swifttailcc ptr @context_in_func() "frame-pointer"="non-leaf" {
define swifttailcc void @write_frame_context(ptr swiftasync %ctx, ptr %newctx) "frame-pointer"="non-leaf" {
; CHECK-LABEL: write_frame_context:
-; CHECK: sub x[[ADDR:[0-9]+]], x29, #8
-; CHECK: str x0, [x[[ADDR]]]
+; CHECK: stur x0, [x29, #-8]
%ptr = call ptr @llvm.swift.async.context.addr()
store ptr %newctx, ptr %ptr
ret void