[llvm] 843c614 - [AArch64] Fix i128 cmpxchg using ldxp/stxp.
Author: Eli Friedman
Date: 2021-07-20T12:38:12-07:00
New Revision: 843c61405881acdac1727c3a1ca4f36156611753
URL: https://github.com/llvm/llvm-project/commit/843c61405881acdac1727c3a1ca4f36156611753
DIFF: https://github.com/llvm/llvm-project/commit/843c61405881acdac1727c3a1ca4f36156611753.diff
LOG: [AArch64] Fix i128 cmpxchg using ldxp/stxp.
There are two parts to this fix:
1. Stop using AtomicExpand to expand cmpxchg i128.
2. Fix AArch64ExpandPseudoInsts to use a correct expansion.
From the ARM Architecture Reference Manual:
To atomically load two 64-bit quantities, perform a Load-Exclusive
pair/Store-Exclusive pair sequence of reading and writing the same value
for which the Store-Exclusive pair succeeds, and use the read values
from the Load-Exclusive pair.
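As a rough sketch of the fixed expansion for a sequentially consistent cmpxchg i128 (using the same placeholder register names as the comments in expandCMP_SWAP_128, so this illustrates the shape of the code rather than the exact register allocation), the key point is that the failure path now writes back the values it just read, so the loaded pair is only used once a store-exclusive pair to the same address has succeeded:

    .Lloadcmp:
            ldaxp   xDestLo, xDestHi, [xAddr]          // exclusive (acquire) load of both halves
            cmp     xDestLo, xDesiredLo
            cset    wTmp, ne
            cmp     xDestHi, xDesiredHi
            cinc    wTmp, wTmp, ne
            cbnz    wTmp, .Lfail                       // comparison failed: take the failure path
    .Lstore:
            stlxp   wStatus, xNewLo, xNewHi, [xAddr]   // try to store the new value
            cbnz    wStatus, .Lloadcmp                 // lost the exclusive monitor: retry
            b       .Ldone
    .Lfail:
            stlxp   wStatus, xDestLo, xDestHi, [xAddr] // store back what was read; if this
            cbnz    wStatus, .Lloadcmp                 // succeeds, the ldaxp result was atomic
    .Ldone: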
Fixes https://bugs.llvm.org/show_bug.cgi?id=51102
Differential Revision: https://reviews.llvm.org/D106039
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrAtomics.td
llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir
llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 4d303876ace2..f8fa9ff3c068 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -279,21 +279,46 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
Register NewLoReg = MI.getOperand(6).getReg();
Register NewHiReg = MI.getOperand(7).getReg();
+ unsigned LdxpOp, StxpOp;
+
+ switch (MI.getOpcode()) {
+ case AArch64::CMP_SWAP_128_MONOTONIC:
+ LdxpOp = AArch64::LDXPX;
+ StxpOp = AArch64::STXPX;
+ break;
+ case AArch64::CMP_SWAP_128_RELEASE:
+ LdxpOp = AArch64::LDXPX;
+ StxpOp = AArch64::STLXPX;
+ break;
+ case AArch64::CMP_SWAP_128_ACQUIRE:
+ LdxpOp = AArch64::LDAXPX;
+ StxpOp = AArch64::STXPX;
+ break;
+ case AArch64::CMP_SWAP_128:
+ LdxpOp = AArch64::LDAXPX;
+ StxpOp = AArch64::STLXPX;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto FailBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoadCmpBB);
MF->insert(++LoadCmpBB->getIterator(), StoreBB);
- MF->insert(++StoreBB->getIterator(), DoneBB);
+ MF->insert(++StoreBB->getIterator(), FailBB);
+ MF->insert(++FailBB->getIterator(), DoneBB);
// .Lloadcmp:
// ldaxp xDestLo, xDestHi, [xAddr]
// cmp xDestLo, xDesiredLo
// sbcs xDestHi, xDesiredHi
// b.ne .Ldone
- BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
+ BuildMI(LoadCmpBB, DL, TII->get(LdxpOp))
.addReg(DestLo.getReg(), RegState::Define)
.addReg(DestHi.getReg(), RegState::Define)
.addReg(AddrReg);
@@ -315,23 +340,37 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
.addUse(StatusReg, getKillRegState(StatusDead))
- .addMBB(DoneBB);
- LoadCmpBB->addSuccessor(DoneBB);
+ .addMBB(FailBB);
+ LoadCmpBB->addSuccessor(FailBB);
LoadCmpBB->addSuccessor(StoreBB);
// .Lstore:
// stlxp wStatus, xNewLo, xNewHi, [xAddr]
// cbnz wStatus, .Lloadcmp
- BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
+ BuildMI(StoreBB, DL, TII->get(StxpOp), StatusReg)
.addReg(NewLoReg)
.addReg(NewHiReg)
.addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
.addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
+ BuildMI(StoreBB, DL, TII->get(AArch64::B)).addMBB(DoneBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
+ // .Lfail:
+ // stlxp wStatus, xDestLo, xDestHi, [xAddr]
+ // cbnz wStatus, .Lloadcmp
+ BuildMI(FailBB, DL, TII->get(StxpOp), StatusReg)
+ .addReg(DestLo.getReg())
+ .addReg(DestHi.getReg())
+ .addReg(AddrReg);
+ BuildMI(FailBB, DL, TII->get(AArch64::CBNZW))
+ .addReg(StatusReg, getKillRegState(StatusDead))
+ .addMBB(LoadCmpBB);
+ FailBB->addSuccessor(LoadCmpBB);
+ FailBB->addSuccessor(DoneBB);
+
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
@@ -343,9 +382,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
// Recompute liveness bottom up.
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *DoneBB);
+ computeAndAddLiveIns(LiveRegs, *FailBB);
computeAndAddLiveIns(LiveRegs, *StoreBB);
computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
+
// Do an extra pass in the loop to get the loop carried dependencies right.
+ FailBB->clearLiveIns();
+ computeAndAddLiveIns(LiveRegs, *FailBB);
StoreBB->clearLiveIns();
computeAndAddLiveIns(LiveRegs, *StoreBB);
LoadCmpBB->clearLiveIns();
@@ -1093,6 +1136,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
AArch64::XZR, NextMBBI);
case AArch64::CMP_SWAP_128:
+ case AArch64::CMP_SWAP_128_RELEASE:
+ case AArch64::CMP_SWAP_128_ACQUIRE:
+ case AArch64::CMP_SWAP_128_MONOTONIC:
return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
case AArch64::AESMCrrTied:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 57138b1e44dc..267f8a5f9d22 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17008,6 +17008,7 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
+ MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
// LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
// so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
@@ -17018,8 +17019,6 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
N->getOperand(0), // Chain in
};
- MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
-
unsigned Opcode;
switch (MemOp->getMergedOrdering()) {
case AtomicOrdering::Monotonic:
@@ -17056,6 +17055,25 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
return;
}
+ unsigned Opcode;
+ switch (MemOp->getMergedOrdering()) {
+ case AtomicOrdering::Monotonic:
+ Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
+ break;
+ case AtomicOrdering::Acquire:
+ Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
+ break;
+ case AtomicOrdering::Release:
+ Opcode = AArch64::CMP_SWAP_128_RELEASE;
+ break;
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ Opcode = AArch64::CMP_SWAP_128;
+ break;
+ default:
+ llvm_unreachable("Unexpected ordering!");
+ }
+
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
@@ -17063,8 +17081,6 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
SDNode *CmpSwap = DAG.getMachineNode(
AArch64::CMP_SWAP_128, SDLoc(N),
DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
-
- MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
@@ -17285,6 +17301,13 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return AtomicExpansionKind::None;
+
+ // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
+ // it.
+ unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
+ if (Size > 64)
+ return AtomicExpansionKind::None;
+
return AtomicExpansionKind::LLSC;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 27e1d8ee6b98..84573dac7e41 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -429,11 +429,16 @@ def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch),
}
let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch",
- mayLoad = 1, mayStore = 1 in
-def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
- (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
- GPR64:$newLo, GPR64:$newHi), []>,
- Sched<[WriteAtomic]>;
+ mayLoad = 1, mayStore = 1 in {
+class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch),
+ (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
+ GPR64:$newLo, GPR64:$newHi), []>,
+ Sched<[WriteAtomic]>;
+def CMP_SWAP_128 : cmp_swap_128;
+def CMP_SWAP_128_RELEASE : cmp_swap_128;
+def CMP_SWAP_128_ACQUIRE : cmp_swap_128;
+def CMP_SWAP_128_MONOTONIC : cmp_swap_128;
+}
// v8.1 Atomic instructions:
let Predicates = [HasLSE] in {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index d37be5ecf46f..04e2aa87db69 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1210,8 +1210,26 @@ bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
// The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
// can take arbitrary registers so it just has the normal GPR64 operands the
// rest of AArch64 is expecting.
+ auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
+ unsigned Opcode;
+ switch (Ordering) {
+ case AtomicOrdering::Acquire:
+ Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
+ break;
+ case AtomicOrdering::Release:
+ Opcode = AArch64::CMP_SWAP_128_RELEASE;
+ break;
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ Opcode = AArch64::CMP_SWAP_128;
+ break;
+ default:
+ Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
+ break;
+ }
+
auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
- CAS = MIRBuilder.buildInstr(AArch64::CMP_SWAP_128, {DstLo, DstHi, Scratch},
+ CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
{Addr, DesiredI->getOperand(0),
DesiredI->getOperand(1), NewI->getOperand(0),
NewI->getOperand(1)});
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
index 219d47349d1e..85072688e42b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
@@ -24,7 +24,7 @@ define void @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LLSC-O0: cmp [[OLD_HI]], x3
; CHECK-LLSC-O0: cinc [[EQUAL:w[0-9]+]], [[EQUAL_TMP]], ne
; CHECK-LLSC-O0: cbnz [[EQUAL]], .LBB0_3
-; CHECK-LLSC-O0: stlxp [[STATUS:w[0-9]+]], x4, x5, [x0]
+; CHECK-LLSC-O0: stxp [[STATUS:w[0-9]+]], x4, x5, [x0]
; CHECK-LLSC-O0: cbnz [[STATUS]], .LBB0_1
; CHECK-LLSC-O0: .LBB0_3:
; CHECK-LLSC-O0: mov v[[OLD:[0-9]+]].d[0], [[OLD_LO]]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir
index 5c03edd472f4..f051fe7c604b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir
@@ -23,7 +23,7 @@ body: |
; CHECK: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64)
; CHECK: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64)
; CHECK: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64)
- ; CHECK: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32 = CMP_SWAP_128 [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire 16)
+ ; CHECK: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire 16)
; CHECK: [[COPY9:%[0-9]+]]:gpr64 = COPY %16
; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64)
; CHECK: G_STORE [[MV]](s128), [[COPY]](p0) :: (store 16)
@@ -39,7 +39,7 @@ body: |
; CHECK-NOLSE: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64)
; CHECK-NOLSE: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64)
; CHECK-NOLSE: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64)
- ; CHECK-NOLSE: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32 = CMP_SWAP_128 [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128))
+ ; CHECK-NOLSE: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128))
; CHECK-NOLSE: [[COPY9:%[0-9]+]]:gpr64 = COPY %16
; CHECK-NOLSE: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64)
; CHECK-NOLSE: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128))
diff --git a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
index 2409a3463df4..3495dceff399 100644
--- a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -8,11 +8,16 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LABEL: val_compare_and_swap:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x[[ADDR:[0-9]+]]]
-; CHECK-DAG: eor [[MISMATCH_LO:x[0-9]+]], [[RESULTLO]], x2
-; CHECK-DAG: eor [[MISMATCH_HI:x[0-9]+]], [[RESULTHI]], x3
-; CHECK: orr [[MISMATCH:x[0-9]+]], [[MISMATCH_LO]], [[MISMATCH_HI]]
-; CHECK: cbnz [[MISMATCH]], [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]]
+; CHECK: cmp
+; CHECK: cset
+; CHECK: cmp
+; CHECK: cinc [[MISMATCH:w[0-9]+]]
+; CHECK: cbz [[MISMATCH]], [[EQUAL:.LBB[0-9]+_[0-9]+]]
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[RESULTLO]], [[RESULTHI]], [x[[ADDR]]]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+; CHECK: b [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: [[EQUAL]]:
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
; CHECK: [[DONE]]:
%pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
@@ -183,8 +188,8 @@ define i128 @atomic_load_relaxed(i64, i64, i128* %p) {
; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2]
; CHECK: cbnz [[SUCCESS]], [[LABEL]]
; CHECK-NOT: dmb
- %r = load atomic i128, i128* %p monotonic, align 16
- ret i128 %r
+ %r = load atomic i128, i128* %p monotonic, align 16
+ ret i128 %r
}
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
index d3a1f144b851..0f2be66dc46e 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -234,7 +234,12 @@ define i128 @test_rmw_add_128(i128* %dst) {
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
; NOLSE-NEXT: cbnz w12, .LBB4_2
+; NOLSE-NEXT: b .LBB4_5
; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
+; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
+; NOLSE-NEXT: cbnz w12, .LBB4_2
+; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1
; NOLSE-NEXT: eor x11, x9, x11
; NOLSE-NEXT: eor x8, x10, x8
@@ -244,8 +249,8 @@ define i128 @test_rmw_add_128(i128* %dst) {
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: cbnz x8, .LBB4_1
-; NOLSE-NEXT: b .LBB4_5
-; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end
+; NOLSE-NEXT: b .LBB4_6
+; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
; NOLSE-NEXT: add sp, sp, #48 // =48
@@ -630,7 +635,12 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
; NOLSE-NEXT: cbnz w12, .LBB9_2
+; NOLSE-NEXT: b .LBB9_5
; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start
+; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
+; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
+; NOLSE-NEXT: cbnz w12, .LBB9_2
+; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1
; NOLSE-NEXT: eor x11, x9, x11
; NOLSE-NEXT: eor x8, x10, x8
@@ -640,8 +650,8 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: cbnz x8, .LBB9_1
-; NOLSE-NEXT: b .LBB9_5
-; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end
+; NOLSE-NEXT: b .LBB9_6
+; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
; NOLSE-NEXT: add sp, sp, #48 // =48