[llvm] 13aa102 - AArch64: use ldp/stp for 128-bit atomic load/store in v8.4 onwards
Tim Northover via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 20 01:50:16 PDT 2021
Author: Tim Northover
Date: 2021-09-20T09:50:11+01:00
New Revision: 13aa102e07695297fd17f68913c343c95a7c56ad
URL: https://github.com/llvm/llvm-project/commit/13aa102e07695297fd17f68913c343c95a7c56ad
DIFF: https://github.com/llvm/llvm-project/commit/13aa102e07695297fd17f68913c343c95a7c56ad.diff
LOG: AArch64: use ldp/stp for 128-bit atomic load/store in v8.4 onwards
v8.4 guarantees that normal loads/stores of 128 bits are single-copy atomic when
properly aligned (as LLVM's naturally aligned atomics are), so we no longer need
a full RMW operation to guarantee a clean read.
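To illustrate the effect (mirroring the new tests below, with a hypothetical function name and approximate register choices), on a subtarget with +lse2 a 16-byte-aligned monotonic atomic i128 access is now expected to select a plain ldp/stp instead of an ldxp/stxp loop; stronger orderings get dmb fences via shouldInsertFencesForAtomic:

    define void @copy(i128* %src, i128* %dst) {
      %v = load atomic i128, i128* %src monotonic, align 16
      store atomic i128 %v, i128* %dst monotonic, align 16
      ret void
    }

    ; With -mattr=+lse2 (or -mattr=+v8.4a) this should lower to roughly:
    ;   ldp x8, x9, [x0]
    ;   stp x8, x9, [x1]
    ;   ret
    ; Exact registers may differ. Without +lse2 the old ldxp/stxp (or CAS)
    ; sequence is still emitted, and under-aligned i128 atomics still go
    ; through the __atomic_* libcalls.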
Added:
llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll
Modified:
llvm/lib/Target/AArch64/AArch64.td
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64Subtarget.h
llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index c8c824231b779..a7af4e61e59ef 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -61,6 +61,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
+def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
+ "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">;
+
def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true",
"Enable out of line atomics to support LSE instructions">;
@@ -459,7 +462,7 @@ def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
"Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
FeatureNV, FeatureMPAM, FeatureDIT,
FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
- FeatureFlagM, FeatureRCPC_IMMO]>;
+ FeatureFlagM, FeatureRCPC_IMMO, FeatureLSE2]>;
def HasV8_5aOps : SubtargetFeature<
"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 52e535a2f08db..186a954e75039 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -785,6 +785,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
+ // Aligned 128-bit loads and stores are single-copy atomic according to the
+ // v8.4a spec.
+ if (Subtarget->hasLSE2()) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+ }
+
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
// custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256 bit inputs.
@@ -4681,18 +4688,7 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
return Result;
}
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
- assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
- SDValue Lo =
- DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
- DAG.getConstant(0, Dl, MVT::i64));
- SDValue Hi =
- DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
- DAG.getConstant(1, Dl, MVT::i64));
- SDValue Result = DAG.getMemIntrinsicNode(
- AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
- {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
- StoreNode->getMemoryVT(), StoreNode->getMemOperand());
- return Result;
+ return LowerStore128(Op, DAG);
} else if (MemVT == MVT::i64x8) {
SDValue Value = StoreNode->getValue();
assert(Value->getValueType(0) == MVT::i64x8);
@@ -4713,6 +4709,31 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
return SDValue();
}
+/// Lower atomic or volatile 128-bit stores to a single STP instruction.
+SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
+ SelectionDAG &DAG) const {
+ MemSDNode *StoreNode = cast<MemSDNode>(Op);
+ assert(StoreNode->getMemoryVT() == MVT::i128);
+ assert(StoreNode->isVolatile() || StoreNode->isAtomic());
+ assert(!StoreNode->isAtomic() ||
+ StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
+ StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
+
+ SDValue Value = StoreNode->getOpcode() == ISD::STORE
+ ? StoreNode->getOperand(1)
+ : StoreNode->getOperand(2);
+ SDLoc DL(Op);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
+ DAG.getConstant(0, DL, MVT::i64));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
+ DAG.getConstant(1, DL, MVT::i64));
+ SDValue Result = DAG.getMemIntrinsicNode(
+ AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
+ {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+ return Result;
+}
+
SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -4950,6 +4971,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
/*OverrideNEON=*/true);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::ATOMIC_STORE:
+ if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
+ assert(Subtarget->hasLSE2());
+ return LowerStore128(Op, DAG);
+ }
+ return SDValue();
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::MSTORE:
@@ -17502,12 +17529,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::ATOMIC_CMP_SWAP:
ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
return;
+ case ISD::ATOMIC_LOAD:
case ISD::LOAD: {
assert(SDValue(N, 0).getValueType() == MVT::i128 &&
"unexpected load's value type");
- LoadSDNode *LoadNode = cast<LoadSDNode>(N);
- if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
- // Non-volatile loads are optimized later in AArch64's load/store
+ MemSDNode *LoadNode = cast<MemSDNode>(N);
+ if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
+ LoadNode->getMemoryVT() != MVT::i128) {
+ // Non-volatile or atomic loads are optimized later in AArch64's load/store
// optimizer.
return;
}
@@ -17598,12 +17627,37 @@ AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
+// provided the address is 16-byte aligned.
+bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
+ if (!Subtarget->hasLSE2())
+ return false;
+
+ if (auto LI = dyn_cast<LoadInst>(I))
+ return LI->getType()->getPrimitiveSizeInBits() == 128 &&
+ LI->getAlignment() >= 16;
+
+ if (auto SI = dyn_cast<StoreInst>(I))
+ return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
+ SI->getAlignment() >= 16;
+
+ return false;
+}
+
+bool AArch64TargetLowering::shouldInsertFencesForAtomic(
+ const Instruction *I) const {
+ return isOpSuitableForLDPSTP(I);
+}
+
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
- return Size == 128;
+ if (Size != 128)
+ return false;
+
+ return !isOpSuitableForLDPSTP(SI);
}
// Loads and stores less than 128-bits are already atomic; ones above that
@@ -17612,7 +17666,11 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
- return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+
+ if (Size != 128 || isOpSuitableForLDPSTP(LI))
+ return AtomicExpansionKind::None;
+
+ return AtomicExpansionKind::LLSC;
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00ed3171ccbac..df60f6f2f67e9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -660,6 +660,9 @@ class AArch64TargetLowering : public TargetLowering {
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
+ bool isOpSuitableForLDPSTP(const Instruction *I) const;
+ bool shouldInsertFencesForAtomic(const Instruction *I) const override;
+
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
@@ -863,6 +866,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerStore128(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 38e09215fd7da..ac8b78ac105db 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -99,6 +99,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool HasDotProd = false;
bool HasCRC = false;
bool HasLSE = false;
+ bool HasLSE2 = false;
bool HasRAS = false;
bool HasRDM = false;
bool HasPerfMon = false;
@@ -375,6 +376,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool hasDotProd() const { return HasDotProd; }
bool hasCRC() const { return HasCRC; }
bool hasLSE() const { return HasLSE; }
+ bool hasLSE2() const { return HasLSE2; }
bool hasRAS() const { return HasRAS; }
bool hasRDM() const { return HasRDM; }
bool hasSM4() const { return HasSM4; }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index b82fa1654169e..8961b8085690c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -824,6 +824,8 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
return isStore ? AArch64::STRSui : AArch64::LDRSui;
case 64:
return isStore ? AArch64::STRDui : AArch64::LDRDui;
+ case 128:
+ return isStore ? AArch64::STRQui : AArch64::LDRQui;
}
break;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 779e9dfd0ac03..cb9e51c2e6c82 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -16,6 +16,7 @@
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -35,6 +36,7 @@ using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
+using namespace MIPatternMatch;
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
: ST(&ST) {
@@ -278,6 +280,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
};
getActionDefinitionsBuilder(G_LOAD)
+ .customIf([=](const LegalityQuery &Query) {
+ return Query.Types[0] == s128 &&
+ Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
+ })
.legalForTypesWithMemDesc({{s8, p0, s8, 8},
{s16, p0, s16, 8},
{s32, p0, s32, 8},
@@ -316,6 +322,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.scalarizeIf(typeIs(0, v2s16), 0);
getActionDefinitionsBuilder(G_STORE)
+ .customIf([=](const LegalityQuery &Query) {
+ return Query.Types[0] == s128 &&
+ Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
+ })
.legalForTypesWithMemDesc({{s8, p0, s8, 8},
{s16, p0, s8, 8}, // truncstorei8 from s16
{s32, p0, s8, 8}, // truncstorei8 from s32
@@ -992,6 +1002,20 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr(
return true;
}
+static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
+ MachineRegisterInfo &MRI) {
+ Base = Root;
+ Offset = 0;
+
+ Register NewBase;
+ int64_t NewOffset;
+ if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
+ isShiftedInt<7, 3>(NewOffset)) {
+ Base = NewBase;
+ Offset = NewOffset;
+ }
+}
+
// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
@@ -1011,6 +1035,36 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
Register ValReg = MI.getOperand(0).getReg();
const LLT ValTy = MRI.getType(ValReg);
+ if (ValTy == LLT::scalar(128)) {
+ assert((*MI.memoperands_begin())->getSuccessOrdering() ==
+ AtomicOrdering::Monotonic ||
+ (*MI.memoperands_begin())->getSuccessOrdering() ==
+ AtomicOrdering::Unordered);
+ assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
+ LLT s64 = LLT::scalar(64);
+ MachineInstrBuilder NewI;
+ if (MI.getOpcode() == TargetOpcode::G_LOAD) {
+ NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
+ MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
+ } else {
+ auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
+ NewI = MIRBuilder.buildInstr(
+ AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
+ }
+ Register Base;
+ int Offset;
+ matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
+ NewI.addUse(Base);
+ NewI.addImm(Offset / 8);
+
+ NewI.cloneMemRefs(MI);
+ constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
+ *MRI.getTargetRegisterInfo(),
+ *ST->getRegBankInfo());
+ MI.eraseFromParent();
+ return true;
+ }
+
if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
ValTy.getElementType().getAddressSpace() != 0) {
LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
index 4676592bc9718..ecf197009407d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O1
-; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=apple-a13 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1
+; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1
; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O0
-; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mcpu=apple-a13 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0
+; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0
@var = global i128 0
define void @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
@@ -411,7 +411,7 @@ define void @atomic_load_relaxed(i64, i64, i128* %p, i128* %p2) {
; CHECK-CAS-O0-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-CAS-O0-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
; CHECK-CAS-O0-NEXT: ldxp x9, x10, [x11]
-; CHECK-CAS-O0-NEXT: mov x8, #0
+; CHECK-CAS-O0-NEXT: mov x8, xzr
; CHECK-CAS-O0-NEXT: orr x9, x9, x8
; CHECK-CAS-O0-NEXT: orr x10, x8, x10
; CHECK-CAS-O0-NEXT: // implicit-def: $q0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
new file mode 100644
index 0000000000000..397d69e93c5f0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
@@ -0,0 +1,212 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - -global-isel=1 -global-isel-abort=1 | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - -global-isel=1 -global-isel-abort=1 | FileCheck %s
+
+define void @test_atomic_load(i128* %addr) {
+; CHECK-LABEL: test_atomic_load:
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %res.0 = load atomic i128, i128* %addr monotonic, align 16
+ store i128 %res.0, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %res.1 = load atomic i128, i128* %addr unordered, align 16
+ store i128 %res.1, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: dmb ish
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %res.2 = load atomic i128, i128* %addr acquire, align 16
+ store i128 %res.2, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: dmb ish
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %res.3 = load atomic i128, i128* %addr seq_cst, align 16
+ store i128 %res.3, i128* %addr
+
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #8]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 8
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ %res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16
+ store i128 %res.5, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %addr8.2 = getelementptr i8, i8* %addr8, i32 504
+ %addr128.2 = bitcast i8* %addr8.2 to i128*
+ %res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16
+ store i128 %res.6, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %addr8.3 = getelementptr i8, i8* %addr8, i32 -512
+ %addr128.3 = bitcast i8* %addr8.3 to i128*
+ %res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16
+ store i128 %res.7, i128* %addr
+
+ ret void
+}
+
+define void @test_libcall_load(i128* %addr) {
+; CHECK-LABEL: test_libcall_load:
+; CHECK: bl __atomic_load
+ %res.8 = load atomic i128, i128* %addr unordered, align 8
+ store i128 %res.8, i128* %addr
+
+ ret void
+}
+
+define void @test_nonfolded_load1(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load1:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #4
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 4
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+ store i128 %res.1, i128* %addr
+
+ ret void
+}
+
+define void @test_nonfolded_load2(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load2:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #512
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 512
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+ store i128 %res.1, i128* %addr
+
+ ret void
+}
+
+define void @test_nonfolded_load3(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load3:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 -520
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+ store i128 %res.1, i128* %addr
+
+ ret void
+}
+
+define void @test_atomic_store(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_atomic_store:
+
+; CHECK: stp x2, x3, [x0]
+ store atomic i128 %val, i128* %addr monotonic, align 16
+
+; CHECK: stp x2, x3, [x0]
+ store atomic i128 %val, i128* %addr unordered, align 16
+
+; CHECK: dmb ish
+; CHECK: stp x2, x3, [x0]
+ store atomic i128 %val, i128* %addr release, align 16
+
+; CHECK: dmb ish
+; CHECK: stp x2, x3, [x0]
+; CHECK: dmb ish
+ store atomic i128 %val, i128* %addr seq_cst, align 16
+
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: stp x2, x3, [x0, #8]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 8
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+; CHECK: stp x2, x3, [x0, #504]
+ %addr8.2 = getelementptr i8, i8* %addr8, i32 504
+ %addr128.2 = bitcast i8* %addr8.2 to i128*
+ store atomic i128 %val, i128* %addr128.2 monotonic, align 16
+
+; CHECK: stp x2, x3, [x0, #-512]
+ %addr8.3 = getelementptr i8, i8* %addr8, i32 -512
+ %addr128.3 = bitcast i8* %addr8.3 to i128*
+ store atomic i128 %val, i128* %addr128.3 monotonic, align 16
+
+ ret void
+}
+
+define void @test_libcall_store(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_libcall_store:
+; CHECK: bl __atomic_store
+ store atomic i128 %val, i128* %addr unordered, align 8
+
+ ret void
+}
+
+define void @test_nonfolded_store1(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store1:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #4
+; CHECK: stp x2, x3, [x[[ADDR]]]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 4
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+ ret void
+}
+
+define void @test_nonfolded_store2(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store2:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #512
+; CHECK: stp x2, x3, [x[[ADDR]]]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 512
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+ ret void
+}
+
+define void @test_nonfolded_store3(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store3:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
+; CHECK: stp x2, x3, [x[[ADDR]]]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 -520
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
index 2ec21c832a47f..1bc8a9606ca25 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
@@ -3,7 +3,7 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s --check-prefix=CHECK-REG
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira -mattr=-lse2 < %s | FileCheck %s
; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
; (i.e. reusing a register for status & data in store exclusive).
diff --git a/llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll b/llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll
new file mode 100644
index 0000000000000..2cd260ead8569
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/v8.4-atomic-128.ll
@@ -0,0 +1,194 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - | FileCheck %s
+
+define void @test_atomic_load(i128* %addr) {
+; CHECK-LABEL: test_atomic_load:
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %res.0 = load atomic i128, i128* %addr monotonic, align 16
+ store i128 %res.0, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %res.1 = load atomic i128, i128* %addr unordered, align 16
+ store i128 %res.1, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: dmb ish
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %res.2 = load atomic i128, i128* %addr acquire, align 16
+ store i128 %res.2, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: dmb ish
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %res.3 = load atomic i128, i128* %addr seq_cst, align 16
+ store i128 %res.3, i128* %addr
+
+
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #32]
+; CHECK-DAG: stp [[LO]], [[HI]], [x0]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 32
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ %res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16
+ store i128 %res.5, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504]
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %addr8.2 = getelementptr i8, i8* %addr8, i32 504
+ %addr128.2 = bitcast i8* %addr8.2 to i128*
+ %res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16
+ store i128 %res.6, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512]
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %addr8.3 = getelementptr i8, i8* %addr8, i32 -512
+ %addr128.3 = bitcast i8* %addr8.3 to i128*
+ %res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16
+ store i128 %res.7, i128* %addr
+
+ ret void
+}
+
+define void @test_libcall_load(i128* %addr) {
+; CHECK-LABEL: test_libcall_load:
+; CHECK: bl __atomic_load
+ %res.8 = load atomic i128, i128* %addr unordered, align 8
+ store i128 %res.8, i128* %addr
+
+ ret void
+}
+
+define void @test_nonfolded_load1(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load1:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #4
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 4
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+ store i128 %res.1, i128* %addr
+
+ ret void
+}
+
+define void @test_nonfolded_load2(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load2:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #512
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 512
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+ store i128 %res.1, i128* %addr
+
+ ret void
+}
+
+define void @test_nonfolded_load3(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load3:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: stp [[LO]], [[HI]], [x0]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 -520
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+ store i128 %res.1, i128* %addr
+
+ ret void
+}
+
+define void @test_atomic_store(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_atomic_store:
+
+; CHECK: stp x2, x3, [x0]
+ store atomic i128 %val, i128* %addr monotonic, align 16
+
+; CHECK: stp x2, x3, [x0]
+ store atomic i128 %val, i128* %addr unordered, align 16
+
+; CHECK: dmb ish
+; CHECK: stp x2, x3, [x0]
+ store atomic i128 %val, i128* %addr release, align 16
+
+; CHECK: dmb ish
+; CHECK: stp x2, x3, [x0]
+; CHECK: dmb ish
+ store atomic i128 %val, i128* %addr seq_cst, align 16
+
+
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: stp x2, x3, [x0, #8]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 8
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+; CHECK: stp x2, x3, [x0, #504]
+ %addr8.2 = getelementptr i8, i8* %addr8, i32 504
+ %addr128.2 = bitcast i8* %addr8.2 to i128*
+ store atomic i128 %val, i128* %addr128.2 monotonic, align 16
+
+; CHECK: stp x2, x3, [x0, #-512]
+ %addr8.3 = getelementptr i8, i8* %addr8, i32 -512
+ %addr128.3 = bitcast i8* %addr8.3 to i128*
+ store atomic i128 %val, i128* %addr128.3 monotonic, align 16
+
+ ret void
+}
+
+define void @test_libcall_store(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_libcall_store:
+; CHECK: bl __atomic_store
+ store atomic i128 %val, i128* %addr unordered, align 8
+
+ ret void
+}
+
+define void @test_nonfolded_store1(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store1:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #4
+; CHECK: stp x2, x3, [x[[ADDR]]]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 4
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+ ret void
+}
+
+define void @test_nonfolded_store2(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store2:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #512
+; CHECK: stp x2, x3, [x[[ADDR]]]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 512
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+ ret void
+}
+
+define void @test_nonfolded_store3(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store3:
+ %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
+; CHECK: stp x2, x3, [x[[ADDR]]]
+ %addr8.1 = getelementptr i8, i8* %addr8, i32 -520
+ %addr128.1 = bitcast i8* %addr8.1 to i128*
+ store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+ ret void
+}