[llvm] b9c3941 - [PowerPC] Generate inlined quadword lock free atomic operations via AtomicExpand
Kai Luo via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 14 18:12:18 PDT 2021
Author: Kai Luo
Date: 2021-07-15T01:12:09Z
New Revision: b9c3941cd61de1e1b9e4f3311ddfa92394475f4b
URL: https://github.com/llvm/llvm-project/commit/b9c3941cd61de1e1b9e4f3311ddfa92394475f4b
DIFF: https://github.com/llvm/llvm-project/commit/b9c3941cd61de1e1b9e4f3311ddfa92394475f4b.diff
LOG: [PowerPC] Generate inlined quadword lock free atomic operations via AtomicExpand
This patch uses AtomicExpandPass to implement quadword lock-free atomic operations. It adopts the method introduced in https://reviews.llvm.org/D47882, which expands atomic operations post-RA to avoid spilling that might prevent LL/SC progress.
Reviewed By: jsji
Differential Revision: https://reviews.llvm.org/D103614
Added:
llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
llvm/test/CodeGen/PowerPC/atomics-i128.ll
Modified:
llvm/include/llvm/IR/IntrinsicsPowerPC.td
llvm/lib/CodeGen/AtomicExpandPass.cpp
llvm/lib/Target/PowerPC/CMakeLists.txt
llvm/lib/Target/PowerPC/PPC.h
llvm/lib/Target/PowerPC/PPC.td
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.h
llvm/lib/Target/PowerPC/PPCInstr64Bit.td
llvm/lib/Target/PowerPC/PPCInstrInfo.td
llvm/lib/Target/PowerPC/PPCSubtarget.h
llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
llvm/test/CodeGen/PowerPC/O3-pipeline.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index b021b43afe59..eb9126149019 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1600,3 +1600,23 @@ let TargetPrefix = "ppc" in {
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
}
+//===----------------------------------------------------------------------===//
+// PowerPC Atomic Intrinsic Definitions.
+let TargetPrefix = "ppc" in {
+ class AtomicRMW128Intrinsic
+ : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+ [llvm_ptr_ty, llvm_i64_ty, llvm_i64_ty],
+ [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
+ def int_ppc_atomicrmw_xchg_i128 : AtomicRMW128Intrinsic;
+ def int_ppc_atomicrmw_add_i128 : AtomicRMW128Intrinsic;
+ def int_ppc_atomicrmw_sub_i128 : AtomicRMW128Intrinsic;
+ def int_ppc_atomicrmw_and_i128 : AtomicRMW128Intrinsic;
+ def int_ppc_atomicrmw_or_i128 : AtomicRMW128Intrinsic;
+ def int_ppc_atomicrmw_xor_i128 : AtomicRMW128Intrinsic;
+ def int_ppc_atomicrmw_nand_i128 : AtomicRMW128Intrinsic;
+ def int_ppc_cmpxchg_i128 : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+ [llvm_ptr_ty,
+ llvm_i64_ty, llvm_i64_ty,
+ llvm_i64_ty, llvm_i64_ty],
+ [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
+}
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index ec996b045219..125a3be585cb 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -689,6 +689,8 @@ static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
if (PMV.ValueType == PMV.WordType) {
PMV.AlignedAddr = Addr;
PMV.AlignedAddrAlignment = AddrAlign;
+ PMV.ShiftAmt = ConstantInt::get(PMV.ValueType, 0);
+ PMV.Mask = ConstantInt::get(PMV.ValueType, ~0);
return PMV;
}
diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt
index 6da1601c169d..195eebf30862 100644
--- a/llvm/lib/Target/PowerPC/CMakeLists.txt
+++ b/llvm/lib/Target/PowerPC/CMakeLists.txt
@@ -27,6 +27,7 @@ add_llvm_target(PowerPCCodeGen
PPCCallingConv.cpp
PPCCCState.cpp
PPCCTRLoops.cpp
+ PPCExpandAtomicPseudoInsts.cpp
PPCHazardRecognizers.cpp
PPCInstrInfo.cpp
PPCISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h
index 8339046e8c26..7235a878e38b 100644
--- a/llvm/lib/Target/PowerPC/PPC.h
+++ b/llvm/lib/Target/PowerPC/PPC.h
@@ -52,6 +52,7 @@ FunctionPass *createPPCCTRLoops();
FunctionPass *createPPCBoolRetToIntPass();
FunctionPass *createPPCExpandISELPass();
FunctionPass *createPPCPreEmitPeepholePass();
+ FunctionPass *createPPCExpandAtomicPseudoPass();
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP);
bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
@@ -75,6 +76,7 @@ FunctionPass *createPPCCTRLoops();
void initializePPCPreEmitPeepholePass(PassRegistry &);
void initializePPCTLSDynamicCallPass(PassRegistry &);
void initializePPCMIPeepholePass(PassRegistry&);
+ void initializePPCExpandAtomicPseudoPass(PassRegistry &);
extern char &PPCVSXFMAMutateID;
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 36d119c7ee16..96746d10c002 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -161,6 +161,9 @@ def FeatureDirectMove :
def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics",
"HasPartwordAtomics", "true",
"Enable l[bh]arx and st[bh]cx.">;
+def FeatureQuadwordAtomic : SubtargetFeature<"quadword-atomics",
+ "HasQuadwordAtomics", "true",
+ "Enable lqarx and stqcx.">;
def FeatureInvariantFunctionDescriptors :
SubtargetFeature<"invariant-function-descriptors",
"HasInvariantFunctionDescriptors", "true",
@@ -331,6 +334,7 @@ def ProcessorFeatures {
FeatureDirectMove,
FeatureICBT,
FeaturePartwordAtomic,
+ FeatureQuadwordAtomic,
FeaturePredictableSelectIsExpensive
];
diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
new file mode 100644
index 000000000000..9daef26ede47
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
@@ -0,0 +1,306 @@
+//===-- PPCExpandAtomicPseudoInsts.cpp - Expand atomic pseudo instrs. -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands atomic pseudo instructions into
+// target instructions post-RA. Each LL/SC loop is thereby treated as a single
+// unit, which makes spilling inside the LL/SC loop unlikely.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-atomic-expand"
+
+namespace {
+
+class PPCExpandAtomicPseudo : public MachineFunctionPass { // expands the *_I128 atomic pseudos
+public:
+ const PPCInstrInfo *TII; // assigned at the start of runOnMachineFunction
+ const PPCRegisterInfo *TRI; // assigned at the start of runOnMachineFunction
+ static char ID;
+ // The constructor registers the pass with the global PassRegistry.
+ PPCExpandAtomicPseudo() : MachineFunctionPass(ID) {
+ initializePPCExpandAtomicPseudoPass(*PassRegistry::getPassRegistry());
+ }
+ // Visits every instruction of MF; returns true if anything was expanded.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ // Helpers below return true when MI was expanded (and erased).
+private:
+ bool expandMI(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI); // dispatches on MI's opcode
+ bool expandAtomicRMW128(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI); // swap/add/sub/and/or/xor/nand
+ bool expandAtomicCmpSwap128(MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI); // ATOMIC_CMP_SWAP_I128
+};
+
+static void PairedCopy(const PPCInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ Register Dest0, Register Dest1, Register Src0,
+ Register Src1) { // Emits Dest0 = Src0 and Dest1 = Src1 before MBBI.
+ const MCInstrDesc &OR = TII->get(PPC::OR8); // OR8 of a register with itself serves as a move
+ const MCInstrDesc &XOR = TII->get(PPC::XOR8);
+ if (Dest0 == Src1 && Dest1 == Src0) {
+ // The trickiest case: the two registers trade values, done with three XORs.
+ BuildMI(MBB, MBBI, DL, XOR, Dest0).addReg(Dest0).addReg(Dest1);
+ BuildMI(MBB, MBBI, DL, XOR, Dest1).addReg(Dest0).addReg(Dest1);
+ BuildMI(MBB, MBBI, DL, XOR, Dest0).addReg(Dest0).addReg(Dest1);
+ } else if (Dest0 != Src0 || Dest1 != Src1) { // nothing is emitted when both copies are no-ops
+ if (Dest0 == Src1 || Dest1 != Src0) { // Dest0 may alias Src1, so Dest1 is written first
+ BuildMI(MBB, MBBI, DL, OR, Dest1).addReg(Src1).addReg(Src1);
+ BuildMI(MBB, MBBI, DL, OR, Dest0).addReg(Src0).addReg(Src0);
+ } else { // Dest1 aliases Src0 here, so Dest0 is written first
+ BuildMI(MBB, MBBI, DL, OR, Dest0).addReg(Src0).addReg(Src0);
+ BuildMI(MBB, MBBI, DL, OR, Dest1).addReg(Src1).addReg(Src1);
+ }
+ }
+}
+
+bool PPCExpandAtomicPseudo::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI = &TII->getRegisterInfo();
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::iterator Cur = MBB.begin();
+    while (Cur != MBB.end()) {
+      // The successor is taken before the call because expandMI receives it
+      // by reference and may redirect it after splitting the block.
+      MachineBasicBlock::iterator Next = std::next(Cur);
+      Changed |= expandMI(MBB, *Cur, Next);
+      Cur = Next;
+    }
+  }
+  if (Changed)
+    MF.RenumberBlocks();
+  return Changed;
+}
+
+bool PPCExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, MachineInstr &MI,
+                                     MachineBasicBlock::iterator &NMBBI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false; // not one of the quadword atomic pseudos
+  case PPC::ATOMIC_CMP_SWAP_I128:
+    return expandAtomicCmpSwap128(MBB, MI, NMBBI);
+  case PPC::ATOMIC_SWAP_I128:
+  case PPC::ATOMIC_LOAD_ADD_I128:
+  case PPC::ATOMIC_LOAD_SUB_I128:
+  case PPC::ATOMIC_LOAD_AND_I128:
+  case PPC::ATOMIC_LOAD_OR_I128:
+  case PPC::ATOMIC_LOAD_XOR_I128:
+  case PPC::ATOMIC_LOAD_NAND_I128:
+    return expandAtomicRMW128(MBB, MI, NMBBI);
+  }
+}
+
+bool PPCExpandAtomicPseudo::expandAtomicRMW128(
+ MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI) { // Replaces a quadword RMW pseudo with an lqarx/stqcx. loop; always returns true.
+ const MCInstrDesc &LL = TII->get(PPC::LQARX);
+ const MCInstrDesc &SC = TII->get(PPC::STQCX);
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ const BasicBlock *BB = MBB.getBasicBlock();
+ // Create layout of control flow: LoopMBB and ExitMBB are inserted after MBB.
+ MachineFunction::iterator MFI = ++MBB.getIterator();
+ MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(MFI, LoopMBB);
+ MF->insert(MFI, ExitMBB);
+ ExitMBB->splice(ExitMBB->begin(), &MBB, std::next(MI.getIterator()),
+ MBB.end()); // everything after MI moves to ExitMBB
+ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ MBB.addSuccessor(LoopMBB);
+
+ // The resulting control flow looks like this (add shown as the example):
+ // MBB:
+ // ...
+ // LoopMBB:
+ // lqarx in, ptr
+ // addc out.sub_x1, in.sub_x1, op.sub_x1
+ // adde out.sub_x0, in.sub_x0, op.sub_x0
+ // stqcx out, ptr
+ // bne- LoopMBB
+ // ExitMBB:
+ // ...
+ Register Old = MI.getOperand(0).getReg(); // register pair receiving the loaded value
+ Register OldHi = TRI->getSubReg(Old, PPC::sub_gp8_x0);
+ Register OldLo = TRI->getSubReg(Old, PPC::sub_gp8_x1);
+ Register Scratch = MI.getOperand(1).getReg(); // register pair holding the value to store
+ Register ScratchHi = TRI->getSubReg(Scratch, PPC::sub_gp8_x0);
+ Register ScratchLo = TRI->getSubReg(Scratch, PPC::sub_gp8_x1);
+ Register RA = MI.getOperand(2).getReg(); // operands 2 and 3 form the address
+ Register RB = MI.getOperand(3).getReg();
+ Register IncrLo = MI.getOperand(4).getReg();
+ Register IncrHi = MI.getOperand(5).getReg();
+ unsigned RMWOpcode = MI.getOpcode();
+
+ MachineBasicBlock *CurrentMBB = LoopMBB;
+ BuildMI(CurrentMBB, DL, LL, Old).addReg(RA).addReg(RB);
+
+ switch (RMWOpcode) {
+ case PPC::ATOMIC_SWAP_I128:
+ PairedCopy(TII, *CurrentMBB, CurrentMBB->end(), DL, ScratchHi, ScratchLo,
+ IncrHi, IncrLo);
+ break;
+ case PPC::ATOMIC_LOAD_ADD_I128:
+ BuildMI(CurrentMBB, DL, TII->get(PPC::ADDC8), ScratchLo)
+ .addReg(IncrLo)
+ .addReg(OldLo);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::ADDE8), ScratchHi)
+ .addReg(IncrHi)
+ .addReg(OldHi);
+ break;
+ case PPC::ATOMIC_LOAD_SUB_I128:
+ BuildMI(CurrentMBB, DL, TII->get(PPC::SUBFC8), ScratchLo)
+ .addReg(IncrLo)
+ .addReg(OldLo);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::SUBFE8), ScratchHi)
+ .addReg(IncrHi)
+ .addReg(OldHi);
+ break;
+ // Bitwise operations apply the same instruction to each 64-bit half.
+#define TRIVIAL_ATOMICRMW(Opcode, Instr) \
+ case Opcode: \
+ BuildMI(CurrentMBB, DL, TII->get((Instr)), ScratchLo) \
+ .addReg(IncrLo) \
+ .addReg(OldLo); \
+ BuildMI(CurrentMBB, DL, TII->get((Instr)), ScratchHi) \
+ .addReg(IncrHi) \
+ .addReg(OldHi); \
+ break
+
+ TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_OR_I128, PPC::OR8);
+ TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_XOR_I128, PPC::XOR8);
+ TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_AND_I128, PPC::AND8);
+ TRIVIAL_ATOMICRMW(PPC::ATOMIC_LOAD_NAND_I128, PPC::NAND8);
+#undef TRIVIAL_ATOMICRMW
+ default:
+ llvm_unreachable("Unhandled atomic RMW operation");
+ }
+ BuildMI(CurrentMBB, DL, SC).addReg(Scratch).addReg(RA).addReg(RB);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE)
+ .addReg(PPC::CR0)
+ .addMBB(LoopMBB); // branch back to LoopMBB when CR0 reports "not equal"
+ CurrentMBB->addSuccessor(LoopMBB);
+ CurrentMBB->addSuccessor(ExitMBB);
+ recomputeLiveIns(*LoopMBB); // new blocks need live-in lists since this runs post-RA
+ recomputeLiveIns(*ExitMBB);
+ NMBBI = MBB.end(); // the rest of MBB now lives in ExitMBB, so the caller stops scanning MBB
+ MI.eraseFromParent();
+ return true;
+}
+
+bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128(
+ MachineBasicBlock &MBB, MachineInstr &MI,
+ MachineBasicBlock::iterator &NMBBI) { // Replaces ATOMIC_CMP_SWAP_I128 with an lqarx/compare/stqcx. sequence; always returns true.
+ const MCInstrDesc &LL = TII->get(PPC::LQARX);
+ const MCInstrDesc &SC = TII->get(PPC::STQCX);
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ const BasicBlock *BB = MBB.getBasicBlock();
+ Register Old = MI.getOperand(0).getReg(); // register pair receiving the loaded value
+ Register OldHi = TRI->getSubReg(Old, PPC::sub_gp8_x0);
+ Register OldLo = TRI->getSubReg(Old, PPC::sub_gp8_x1);
+ Register Scratch = MI.getOperand(1).getReg(); // used for the comparison, then for the value to store
+ Register ScratchHi = TRI->getSubReg(Scratch, PPC::sub_gp8_x0);
+ Register ScratchLo = TRI->getSubReg(Scratch, PPC::sub_gp8_x1);
+ Register RA = MI.getOperand(2).getReg(); // operands 2 and 3 form the address
+ Register RB = MI.getOperand(3).getReg();
+ Register CmpLo = MI.getOperand(4).getReg();
+ Register CmpHi = MI.getOperand(5).getReg();
+ Register NewLo = MI.getOperand(6).getReg();
+ Register NewHi = MI.getOperand(7).getReg();
+ // Create layout of control flow (four new blocks inserted after MBB):
+ // loop:
+ // old = lqarx ptr
+ // <compare old, cmp>
+ // bne 0, fail
+ // succ:
+ // stqcx new ptr
+ // bne 0, loop
+ // b exit
+ // fail:
+ // stqcx old ptr
+ // exit:
+ // ....
+ MachineFunction::iterator MFI = ++MBB.getIterator();
+ MachineBasicBlock *LoopCmpMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *CmpSuccMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *CmpFailMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(MFI, LoopCmpMBB);
+ MF->insert(MFI, CmpSuccMBB);
+ MF->insert(MFI, CmpFailMBB);
+ MF->insert(MFI, ExitMBB);
+ ExitMBB->splice(ExitMBB->begin(), &MBB, std::next(MI.getIterator()),
+ MBB.end()); // everything after MI moves to ExitMBB
+ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ MBB.addSuccessor(LoopCmpMBB);
+ // Build loop: load the quadword, then XOR both halves with the expected value.
+ MachineBasicBlock *CurrentMBB = LoopCmpMBB;
+ BuildMI(CurrentMBB, DL, LL, Old).addReg(RA).addReg(RB);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::XOR8), ScratchLo)
+ .addReg(OldLo)
+ .addReg(CmpLo);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::XOR8), ScratchHi)
+ .addReg(OldHi)
+ .addReg(CmpHi);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::OR8_rec), ScratchLo)
+ .addReg(ScratchLo)
+ .addReg(ScratchHi); // combined difference of both halves; its CR0 result drives the branch below
+ BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE)
+ .addReg(PPC::CR0)
+ .addMBB(CmpFailMBB);
+ CurrentMBB->addSuccessor(CmpSuccMBB);
+ CurrentMBB->addSuccessor(CmpFailMBB);
+ // Build succ: copy the new value into the scratch pair and try to store it.
+ CurrentMBB = CmpSuccMBB;
+ PairedCopy(TII, *CurrentMBB, CurrentMBB->end(), DL, ScratchHi, ScratchLo,
+ NewHi, NewLo);
+ BuildMI(CurrentMBB, DL, SC).addReg(Scratch).addReg(RA).addReg(RB);
+ BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_NE)
+ .addReg(PPC::CR0)
+ .addMBB(LoopCmpMBB); // retry from the load when CR0 reports "not equal"
+ BuildMI(CurrentMBB, DL, TII->get(PPC::B)).addMBB(ExitMBB);
+ CurrentMBB->addSuccessor(LoopCmpMBB);
+ CurrentMBB->addSuccessor(ExitMBB);
+ CurrentMBB = CmpFailMBB; // fail block: stores back the value just loaded, then falls through to exit
+ BuildMI(CurrentMBB, DL, SC).addReg(Old).addReg(RA).addReg(RB); // NOTE(review): presumably clears the lqarx reservation - confirm
+ CurrentMBB->addSuccessor(ExitMBB);
+
+ recomputeLiveIns(*LoopCmpMBB);
+ recomputeLiveIns(*CmpSuccMBB);
+ recomputeLiveIns(*CmpFailMBB);
+ recomputeLiveIns(*ExitMBB);
+ NMBBI = MBB.end(); // the rest of MBB now lives in ExitMBB, so the caller stops scanning MBB
+ MI.eraseFromParent();
+ return true;
+}
+
+} // namespace
+
+INITIALIZE_PASS(PPCExpandAtomicPseudo, DEBUG_TYPE, "PowerPC Expand Atomic",
+ false, false)
+
+char PPCExpandAtomicPseudo::ID = 0;
+FunctionPass *llvm::createPPCExpandAtomicPseudoPass() {
+ return new PPCExpandAtomicPseudo();
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 19fab225df2d..8ee938e01d79 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -121,6 +121,11 @@ cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
+static cl::opt<bool> EnableQuadwordAtomics(
+ "ppc-quadword-atomics",
+ cl::desc("enable quadword lock-free atomic operations"), cl::init(false),
+ cl::Hidden);
+
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
@@ -1281,6 +1286,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
+ if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics())
+ setMaxAtomicSizeInBitsSupported(128);
+
setBooleanContents(ZeroOrOneBooleanContent);
if (Subtarget.hasAltivec()) {
@@ -12628,6 +12636,17 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
} else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
return emitProbedAlloca(MI, BB);
+ } else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) {
+ DebugLoc DL = MI.getDebugLoc();
+ Register Src = MI.getOperand(2).getReg();
+ Register Lo = MI.getOperand(0).getReg();
+ Register Hi = MI.getOperand(1).getReg();
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
+ .addDef(Lo)
+ .addUse(Src, 0, PPC::sub_gp8_x1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
+ .addDef(Hi)
+ .addUse(Src, 0, PPC::sub_gp8_x0);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
@@ -16042,6 +16061,22 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
+ case Intrinsic::ppc_atomicrmw_xchg_i128:
+ case Intrinsic::ppc_atomicrmw_add_i128:
+ case Intrinsic::ppc_atomicrmw_sub_i128:
+ case Intrinsic::ppc_atomicrmw_nand_i128:
+ case Intrinsic::ppc_atomicrmw_and_i128:
+ case Intrinsic::ppc_atomicrmw_or_i128:
+ case Intrinsic::ppc_atomicrmw_xor_i128:
+ case Intrinsic::ppc_cmpxchg_i128:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i128;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(16);
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
+ return true;
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
@@ -17457,3 +17492,102 @@ CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
return CC_PPC64_ELF_FIS;
}
}
+
+TargetLowering::AtomicExpansionKind
+PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  // Only integer quadword RMW is handled here; all else uses the default.
+  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  if (!AI->isFloatingPointOperation() && EnableQuadwordAtomics &&
+      Subtarget.hasQuadwordAtomics() && Size == 128)
+    return AtomicExpansionKind::MaskedIntrinsic;
+  return TargetLowering::shouldExpandAtomicRMWInIR(AI);
+}
+
+TargetLowering::AtomicExpansionKind
+PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
+  // The operation width is taken from the pointee type of the address operand.
+  Type *PointeeTy = AI->getPointerOperand()->getType()->getPointerElementType();
+  unsigned Size = PointeeTy->getPrimitiveSizeInBits();
+  const bool QuadwordEnabled =
+      EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics();
+  return QuadwordEnabled && Size == 128 ? AtomicExpansionKind::MaskedIntrinsic
+                                        : AtomicExpansionKind::None;
+}
+
+static Intrinsic::ID
+getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
+  switch (BinOp) { // one i128 intrinsic per supported RMW operation
+  case AtomicRMWInst::Add:
+    return Intrinsic::ppc_atomicrmw_add_i128;
+  case AtomicRMWInst::And:
+    return Intrinsic::ppc_atomicrmw_and_i128;
+  case AtomicRMWInst::Nand:
+    return Intrinsic::ppc_atomicrmw_nand_i128;
+  case AtomicRMWInst::Or:
+    return Intrinsic::ppc_atomicrmw_or_i128;
+  case AtomicRMWInst::Sub:
+    return Intrinsic::ppc_atomicrmw_sub_i128;
+  case AtomicRMWInst::Xchg:
+    return Intrinsic::ppc_atomicrmw_xchg_i128;
+  case AtomicRMWInst::Xor:
+    return Intrinsic::ppc_atomicrmw_xor_i128;
+  default: // any other operation has no mapping here
+    llvm_unreachable("Unexpected AtomicRMW BinOp");
+  }
+}
+
+Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
+    IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
+    Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
+  assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() &&
+         "Only support quadword now");
+  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *WideTy = cast<PointerType>(AlignedAddr->getType())->getElementType();
+  assert(WideTy->getPrimitiveSizeInBits() == 128);
+  Intrinsic::ID IID = getIntrinsicForAtomicRMWBinOp128(AI->getOperation());
+  Function *RMWFn = Intrinsic::getDeclaration(Mod, IID);
+  Type *HalfTy = Type::getInt64Ty(Mod->getContext());
+  Value *OperandLo = Builder.CreateTrunc(Incr, HalfTy, "incr_lo"); // low half
+  Value *OperandUpper = Builder.CreateLShr(Incr, 64);
+  Value *OperandHi = Builder.CreateTrunc(OperandUpper, HalfTy, "incr_hi");
+  Type *BytePtrTy = Type::getInt8PtrTy(Mod->getContext());
+  Value *RawPtr = Builder.CreateBitCast(AlignedAddr, BytePtrTy);
+  Value *Pair = Builder.CreateCall(RMWFn, {RawPtr, OperandLo, OperandHi});
+  Value *ResLo = Builder.CreateExtractValue(Pair, 0, "lo"); // {lo, hi} result
+  Value *ResHi = Builder.CreateExtractValue(Pair, 1, "hi");
+  ResLo = Builder.CreateZExt(ResLo, WideTy, "lo64");
+  ResHi = Builder.CreateZExt(ResHi, WideTy, "hi64");
+  Value *ShiftedHi = Builder.CreateShl(ResHi, ConstantInt::get(WideTy, 64));
+  return Builder.CreateOr(ResLo, ShiftedHi, "val64"); // rejoin both halves
+}
+
+Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
+    IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
+    Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
+  assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() &&
+         "Only support quadword now");
+  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *WideTy = cast<PointerType>(AlignedAddr->getType())->getElementType();
+  assert(WideTy->getPrimitiveSizeInBits() == 128);
+  Function *CmpXchgFn =
+      Intrinsic::getDeclaration(Mod, Intrinsic::ppc_cmpxchg_i128);
+  Type *HalfTy = Type::getInt64Ty(Mod->getContext());
+  // Both 128-bit inputs are passed to the intrinsic as (lo, hi) i64 pairs.
+  Value *ExpectLo = Builder.CreateTrunc(CmpVal, HalfTy, "cmp_lo");
+  Value *ExpectUpper = Builder.CreateLShr(CmpVal, 64);
+  Value *ExpectHi = Builder.CreateTrunc(ExpectUpper, HalfTy, "cmp_hi");
+  Value *DesiredLo = Builder.CreateTrunc(NewVal, HalfTy, "new_lo");
+  Value *DesiredUpper = Builder.CreateLShr(NewVal, 64);
+  Value *DesiredHi = Builder.CreateTrunc(DesiredUpper, HalfTy, "new_hi");
+  Type *BytePtrTy = Type::getInt8PtrTy(Mod->getContext());
+  Value *RawPtr = Builder.CreateBitCast(AlignedAddr, BytePtrTy);
+  emitLeadingFence(Builder, CI, Ord);
+  Value *Pair = Builder.CreateCall(
+      CmpXchgFn, {RawPtr, ExpectLo, ExpectHi, DesiredLo, DesiredHi});
+  emitTrailingFence(Builder, CI, Ord);
+  Value *ResLo = Builder.CreateExtractValue(Pair, 0, "lo");
+  Value *ResHi = Builder.CreateExtractValue(Pair, 1, "hi");
+  ResLo = Builder.CreateZExt(ResLo, WideTy, "lo64");
+  ResHi = Builder.CreateZExt(ResHi, WideTy, "hi64");
+  return Builder.CreateOr(
+      ResLo, Builder.CreateShl(ResHi, ConstantInt::get(WideTy, 64)), "val64");
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 19f9eff4c3d5..87579bad118f 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -876,6 +876,23 @@ namespace llvm {
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
+ TargetLowering::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+ TargetLowering::AtomicExpansionKind
+ shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+ Value *emitMaskedAtomicRMWIntrinsic(IRBuilderBase &Builder,
+ AtomicRMWInst *AI, Value *AlignedAddr,
+ Value *Incr, Value *Mask,
+ Value *ShiftAmt,
+ AtomicOrdering Ord) const override;
+ Value *emitMaskedAtomicCmpXchgIntrinsic(IRBuilderBase &Builder,
+ AtomicCmpXchgInst *CI,
+ Value *AlignedAddr, Value *CmpVal,
+ Value *NewVal, Value *Mask,
+ AtomicOrdering Ord) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 496f76b69d2e..931b7790b1d8 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -304,6 +304,88 @@ def STQCX : XForm_1_memOp<31, 182, (outs), (ins g8prc:$RSp, memrr:$dst),
isPPC64, isRecordForm;
}
+def SPLIT_QUADWORD : PPCCustomInserterPseudo<(outs g8rc:$lo, g8rc:$hi),
+ (ins g8prc:$src),
+ "#SPLIT_QUADWORD", []>;
+class AtomicRMW128<string asmstr>
+ : PPCPostRAExpPseudo<(outs g8prc:$RTp, g8prc:$scratch),
+ (ins memrr:$ptr, g8rc:$incr_lo, g8rc:$incr_hi),
+ asmstr, []>;
+// We have to keep values in MI's uses during LL/SC looping as they are,
+// so set both $RTp and $scratch earlyclobber.
+let mayStore = 1, mayLoad = 1,
+ Defs = [CR0],
+ Constraints = "@earlyclobber $scratch,@earlyclobber $RTp" in {
+// Atomic pseudo instructions expanded post-ra.
+def ATOMIC_SWAP_I128 : AtomicRMW128<"#ATOMIC_SWAP_I128">;
+def ATOMIC_LOAD_ADD_I128 : AtomicRMW128<"#ATOMIC_LOAD_ADD_I128">;
+def ATOMIC_LOAD_SUB_I128 : AtomicRMW128<"#ATOMIC_LOAD_SUB_I128">;
+def ATOMIC_LOAD_AND_I128 : AtomicRMW128<"#ATOMIC_LOAD_AND_I128">;
+def ATOMIC_LOAD_XOR_I128 : AtomicRMW128<"#ATOMIC_LOAD_XOR_I128">;
+def ATOMIC_LOAD_OR_I128 : AtomicRMW128<"#ATOMIC_LOAD_OR_I128">;
+def ATOMIC_LOAD_NAND_I128 : AtomicRMW128<"#ATOMIC_LOAD_NAND_I128">;
+
+def ATOMIC_CMP_SWAP_I128 : PPCPostRAExpPseudo<
+ (outs g8prc:$RTp, g8prc:$scratch),
+ (ins memrr:$ptr, g8rc:$cmp_lo, g8rc:$cmp_hi,
+ g8rc:$new_lo, g8rc:$new_hi),
+ "#ATOMIC_CMP_SWAP_I128", []>;
+}
+
+def : Pat<(int_ppc_atomicrmw_add_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_ADD_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_sub_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_SUB_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_xor_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_XOR_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_and_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_AND_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_nand_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_NAND_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_or_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_LOAD_OR_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_atomicrmw_xchg_i128 ForceXForm:$ptr,
+ i64:$incr_lo,
+ i64:$incr_hi),
+ (SPLIT_QUADWORD (ATOMIC_SWAP_I128 memrr:$ptr,
+ g8rc:$incr_lo,
+ g8rc:$incr_hi))>;
+def : Pat<(int_ppc_cmpxchg_i128 ForceXForm:$ptr,
+ i64:$cmp_lo,
+ i64:$cmp_hi,
+ i64:$new_lo,
+ i64:$new_hi),
+ (SPLIT_QUADWORD (ATOMIC_CMP_SWAP_I128
+ memrr:$ptr,
+ g8rc:$cmp_lo,
+ g8rc:$cmp_hi,
+ g8rc:$new_lo,
+ g8rc:$new_hi))>;
+
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
"stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index d97881fe818b..1658618800e9 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1170,6 +1170,7 @@ def IsE500 : Predicate<"Subtarget->isE500()">;
def HasSPE : Predicate<"Subtarget->hasSPE()">;
def HasICBT : Predicate<"Subtarget->hasICBT()">;
def HasPartwordAtomics : Predicate<"Subtarget->hasPartwordAtomics()">;
+def HasQuadwordAtomics : Predicate<"Subtarget->hasQuadwordAtomics()">;
def NoNaNsFPMath
: Predicate<"Subtarget->getTargetMachine().Options.NoNaNsFPMath">;
def NaNsFPMath
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 17139a85a812..e916b0c02000 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -139,6 +139,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
bool HasICBT;
bool HasInvariantFunctionDescriptors;
bool HasPartwordAtomics;
+ bool HasQuadwordAtomics;
bool HasDirectMove;
bool HasHTM;
bool HasFloat128;
@@ -302,6 +303,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
bool usePPCPreRASchedStrategy() const { return UsePPCPreRASchedStrategy; }
bool usePPCPostRASchedStrategy() const { return UsePPCPostRASchedStrategy; }
bool hasPartwordAtomics() const { return HasPartwordAtomics; }
+ bool hasQuadwordAtomics() const { return HasQuadwordAtomics; }
bool hasDirectMove() const { return HasDirectMove; }
Align getPlatformStackAlignment() const {
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index a29c7d9e20e8..82717300a480 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -123,6 +123,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() {
initializePPCTLSDynamicCallPass(PR);
initializePPCMIPeepholePass(PR);
initializePPCLowerMASSVEntriesPass(PR);
+ initializePPCExpandAtomicPseudoPass(PR);
initializeGlobalISel(PR);
}
@@ -539,6 +540,10 @@ void PPCPassConfig::addPreEmitPass() {
}
void PPCPassConfig::addPreEmitPass2() {
+ // Schedule the expansion of AMOs at the last possible moment, avoiding the
+ // possibility for other passes to break the requirements for forward
+ // progress in the LL/SC block.
+ addPass(createPPCExpandAtomicPseudoPass());
// Must run branch selection immediately preceding the asm printer.
addPass(createPPCBranchSelectionPass());
}
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index 308072df3e69..aee62db60390 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -197,6 +197,7 @@
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: StackMap Liveness Analysis
; CHECK-NEXT: Live DEBUG_VALUE analysis
+; CHECK-NEXT: PowerPC Expand Atomic
; CHECK-NEXT: PowerPC Branch Selector
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
diff --git a/llvm/test/CodeGen/PowerPC/atomics-i128.ll b/llvm/test/CodeGen/PowerPC/atomics-i128.ll
new file mode 100644
index 000000000000..9647288ea6bc
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/atomics-i128.ll
@@ -0,0 +1,452 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown -mcpu=pwr8 \
+; RUN: -ppc-asm-full-reg-names -ppc-quadword-atomics \
+; RUN: -ppc-track-subreg-liveness < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 \
+; RUN: -ppc-asm-full-reg-names -ppc-quadword-atomics \
+; RUN: -ppc-track-subreg-liveness < %s | FileCheck --check-prefix=PWR7 %s
+
+
+define i128 @swap(i128* %a, i128 %x) {
+; CHECK-LABEL: swap:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sync
+; CHECK-NEXT: .LBB0_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r6, 0, r3
+; CHECK-NEXT: mr r9, r5
+; CHECK-NEXT: mr r8, r4
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: bne cr0, .LBB0_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r6
+; CHECK-NEXT: mr r4, r7
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: swap:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: sync
+; PWR7-NEXT: bl __sync_lock_test_and_set_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = atomicrmw xchg i128* %a, i128 %x seq_cst, align 16
+ ret i128 %0
+}
+
+define i128 @add(i128* %a, i128 %x) {
+; CHECK-LABEL: add:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sync
+; CHECK-NEXT: .LBB1_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r6, 0, r3
+; CHECK-NEXT: addc r9, r5, r7
+; CHECK-NEXT: adde r8, r4, r6
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: bne cr0, .LBB1_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r6
+; CHECK-NEXT: mr r4, r7
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: add:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: sync
+; PWR7-NEXT: bl __sync_fetch_and_add_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = atomicrmw add i128* %a, i128 %x seq_cst, align 16
+ ret i128 %0
+}
+
+define i128 @sub(i128* %a, i128 %x) {
+; CHECK-LABEL: sub:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sync
+; CHECK-NEXT: .LBB2_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r6, 0, r3
+; CHECK-NEXT: subc r9, r7, r5
+; CHECK-NEXT: subfe r8, r4, r6
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: bne cr0, .LBB2_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r6
+; CHECK-NEXT: mr r4, r7
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: sub:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: sync
+; PWR7-NEXT: bl __sync_fetch_and_sub_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = atomicrmw sub i128* %a, i128 %x seq_cst, align 16
+ ret i128 %0
+}
+
+define i128 @and(i128* %a, i128 %x) {
+; CHECK-LABEL: and:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sync
+; CHECK-NEXT: .LBB3_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r6, 0, r3
+; CHECK-NEXT: and r9, r5, r7
+; CHECK-NEXT: and r8, r4, r6
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: bne cr0, .LBB3_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r6
+; CHECK-NEXT: mr r4, r7
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: and:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: sync
+; PWR7-NEXT: bl __sync_fetch_and_and_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = atomicrmw and i128* %a, i128 %x seq_cst, align 16
+ ret i128 %0
+}
+
+define i128 @or(i128* %a, i128 %x) {
+; CHECK-LABEL: or:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sync
+; CHECK-NEXT: .LBB4_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r6, 0, r3
+; CHECK-NEXT: or r9, r5, r7
+; CHECK-NEXT: or r8, r4, r6
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: bne cr0, .LBB4_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r6
+; CHECK-NEXT: mr r4, r7
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: or:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: sync
+; PWR7-NEXT: bl __sync_fetch_and_or_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = atomicrmw or i128* %a, i128 %x seq_cst, align 16
+ ret i128 %0
+}
+
+define i128 @xor(i128* %a, i128 %x) {
+; CHECK-LABEL: xor:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sync
+; CHECK-NEXT: .LBB5_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r6, 0, r3
+; CHECK-NEXT: xor r9, r5, r7
+; CHECK-NEXT: xor r8, r4, r6
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: bne cr0, .LBB5_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r6
+; CHECK-NEXT: mr r4, r7
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: xor:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: sync
+; PWR7-NEXT: bl __sync_fetch_and_xor_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = atomicrmw xor i128* %a, i128 %x seq_cst, align 16
+ ret i128 %0
+}
+
+define i128 @nand(i128* %a, i128 %x) {
+; CHECK-LABEL: nand:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sync
+; CHECK-NEXT: .LBB6_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r6, 0, r3
+; CHECK-NEXT: nand r9, r5, r7
+; CHECK-NEXT: nand r8, r4, r6
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: bne cr0, .LBB6_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r6
+; CHECK-NEXT: mr r4, r7
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: nand:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: sync
+; PWR7-NEXT: bl __sync_fetch_and_nand_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = atomicrmw nand i128* %a, i128 %x seq_cst, align 16
+ ret i128 %0
+}
+
+;; CmpXchg
+define i128 @cas_weak_acquire_acquire(i128* %a, i128 %cmp, i128 %new) {
+; CHECK-LABEL: cas_weak_acquire_acquire:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: .LBB7_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r8, 0, r3
+; CHECK-NEXT: xor r11, r9, r5
+; CHECK-NEXT: xor r10, r8, r4
+; CHECK-NEXT: or. r11, r11, r10
+; CHECK-NEXT: bne cr0, .LBB7_3
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: mr r11, r7
+; CHECK-NEXT: mr r10, r6
+; CHECK-NEXT: stqcx. r10, 0, r3
+; CHECK-NEXT: bne cr0, .LBB7_1
+; CHECK-NEXT: b .LBB7_4
+; CHECK-NEXT: .LBB7_3: # %entry
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: .LBB7_4: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r8
+; CHECK-NEXT: mr r4, r9
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: cas_weak_acquire_acquire:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: bl __sync_val_compare_and_swap_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = cmpxchg weak i128* %a, i128 %cmp, i128 %new acquire acquire
+ %1 = extractvalue { i128, i1 } %0, 0
+ ret i128 %1
+}
+
+define i128 @cas_weak_release_monotonic(i128* %a, i128 %cmp, i128 %new) {
+; CHECK-LABEL: cas_weak_release_monotonic:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: .LBB8_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r8, 0, r3
+; CHECK-NEXT: xor r11, r9, r5
+; CHECK-NEXT: xor r10, r8, r4
+; CHECK-NEXT: or. r11, r11, r10
+; CHECK-NEXT: bne cr0, .LBB8_3
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: mr r11, r7
+; CHECK-NEXT: mr r10, r6
+; CHECK-NEXT: stqcx. r10, 0, r3
+; CHECK-NEXT: bne cr0, .LBB8_1
+; CHECK-NEXT: b .LBB8_4
+; CHECK-NEXT: .LBB8_3: # %entry
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: .LBB8_4: # %entry
+; CHECK-NEXT: mr r3, r8
+; CHECK-NEXT: mr r4, r9
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: cas_weak_release_monotonic:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: bl __sync_val_compare_and_swap_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = cmpxchg weak i128* %a, i128 %cmp, i128 %new release monotonic
+ %1 = extractvalue { i128, i1 } %0, 0
+ ret i128 %1
+}
+
+define i128 @cas_sc_sc(i128* %a, i128 %cmp, i128 %new) {
+; CHECK-LABEL: cas_sc_sc:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sync
+; CHECK-NEXT: .LBB9_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r8, 0, r3
+; CHECK-NEXT: xor r11, r9, r5
+; CHECK-NEXT: xor r10, r8, r4
+; CHECK-NEXT: or. r11, r11, r10
+; CHECK-NEXT: bne cr0, .LBB9_3
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: mr r11, r7
+; CHECK-NEXT: mr r10, r6
+; CHECK-NEXT: stqcx. r10, 0, r3
+; CHECK-NEXT: bne cr0, .LBB9_1
+; CHECK-NEXT: b .LBB9_4
+; CHECK-NEXT: .LBB9_3: # %entry
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: .LBB9_4: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r8
+; CHECK-NEXT: mr r4, r9
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: cas_sc_sc:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: sync
+; PWR7-NEXT: bl __sync_val_compare_and_swap_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = cmpxchg i128* %a, i128 %cmp, i128 %new seq_cst seq_cst
+ %1 = extractvalue { i128, i1 } %0, 0
+ ret i128 %1
+}
+
+define i128 @cas_acqrel_acquire(i128* %a, i128 %cmp, i128 %new) {
+; CHECK-LABEL: cas_acqrel_acquire:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: .LBB10_1: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: lqarx r8, 0, r3
+; CHECK-NEXT: xor r11, r9, r5
+; CHECK-NEXT: xor r10, r8, r4
+; CHECK-NEXT: or. r11, r11, r10
+; CHECK-NEXT: bne cr0, .LBB10_3
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: #
+; CHECK-NEXT: mr r11, r7
+; CHECK-NEXT: mr r10, r6
+; CHECK-NEXT: stqcx. r10, 0, r3
+; CHECK-NEXT: bne cr0, .LBB10_1
+; CHECK-NEXT: b .LBB10_4
+; CHECK-NEXT: .LBB10_3: # %entry
+; CHECK-NEXT: stqcx. r8, 0, r3
+; CHECK-NEXT: .LBB10_4: # %entry
+; CHECK-NEXT: lwsync
+; CHECK-NEXT: mr r3, r8
+; CHECK-NEXT: mr r4, r9
+; CHECK-NEXT: blr
+;
+; PWR7-LABEL: cas_acqrel_acquire:
+; PWR7: # %bb.0: # %entry
+; PWR7-NEXT: mflr r0
+; PWR7-NEXT: std r0, 16(r1)
+; PWR7-NEXT: stdu r1, -112(r1)
+; PWR7-NEXT: .cfi_def_cfa_offset 112
+; PWR7-NEXT: .cfi_offset lr, 16
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: bl __sync_val_compare_and_swap_16
+; PWR7-NEXT: nop
+; PWR7-NEXT: lwsync
+; PWR7-NEXT: addi r1, r1, 112
+; PWR7-NEXT: ld r0, 16(r1)
+; PWR7-NEXT: mtlr r0
+; PWR7-NEXT: blr
+entry:
+ %0 = cmpxchg i128* %a, i128 %cmp, i128 %new acq_rel acquire
+ %1 = extractvalue { i128, i1 } %0, 0
+ ret i128 %1
+}
More information about the llvm-commits mailing list