[llvm-branch-commits] [llvm] AArch64: Use consistent atomicrmw expansion for FP operations (PR #103702)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Aug 13 23:46:42 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
Use LLSC or cmpxchg in the same cases as for the unsupported
integer operations. This required some fixups to the LLSC
implementation to deal with the fp128 case.
---
Patch is 88.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/103702.diff
5 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+18-12)
- (modified) llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll (+94-298)
- (modified) llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll (+107-311)
- (modified) llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll (+107-311)
- (modified) llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll (+94-298)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 40d9fa4f2b494a..26f698898e487b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27061,9 +27061,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
- if (AI->isFloatingPointOperation())
- return AtomicExpansionKind::CmpXChg;
-
bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
(AI->getOperation() == AtomicRMWInst::Xchg ||
AI->getOperation() == AtomicRMWInst::Or ||
@@ -27073,7 +27070,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
// Nand is not supported in LSE.
// Leave 128 bits to LLSC or CmpXChg.
- if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
+ if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
+ !AI->isFloatingPointOperation()) {
if (Subtarget->hasLSE())
return AtomicExpansionKind::None;
if (Subtarget->outlineAtomics()) {
@@ -27146,10 +27144,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
- Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
- Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
- return Builder.CreateOr(
- Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
+
+ auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
+ Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
+ Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
+
+ Value *Or = Builder.CreateOr(
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
+ return Builder.CreateBitCast(Or, ValueTy);
}
Type *Tys[] = { Addr->getType() };
@@ -27160,8 +27162,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
const DataLayout &DL = M->getDataLayout();
IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
CallInst *CI = Builder.CreateCall(Ldxr, Addr);
- CI->addParamAttr(
- 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
+ CI->addParamAttr(0, Attribute::get(Builder.getContext(),
+ Attribute::ElementType, IntEltTy));
Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
return Builder.CreateBitCast(Trunc, ValueTy);
@@ -27187,9 +27189,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
Function *Stxr = Intrinsic::getDeclaration(M, Int);
Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ Type *Int128Ty = Type::getInt128Ty(M->getContext());
+
+ Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
- Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
- Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+ Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
+ Value *Hi =
+ Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
}
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
index f95caf325b197c..2c6461097f7d9b 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
@@ -6,33 +6,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2:
; NOLSE: // %bb.0:
; NOLSE-NEXT: fcvt s1, h0
-; NOLSE-NEXT: ldr h0, [x0]
-; NOLSE-NEXT: b .LBB0_2
; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9, uxth
-; NOLSE-NEXT: b.eq .LBB0_5
-; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB0_3 Depth 2
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxrh w8, [x0]
+; NOLSE-NEXT: fmov s0, w8
; NOLSE-NEXT: fcvt s2, h0
-; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: fadd s2, s2, s1
; NOLSE-NEXT: fcvt h2, s2
; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxrh w10, [x0]
-; NOLSE-NEXT: cmp w10, w9, uxth
-; NOLSE-NEXT: b.ne .LBB0_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2
-; NOLSE-NEXT: stlxrh wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB0_3
-; NOLSE-NEXT: b .LBB0_1
-; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxrh w9, w8, [x0]
+; NOLSE-NEXT: cbnz w9, .LBB0_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0
; NOLSE-NEXT: ret
;
@@ -63,33 +47,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4:
; NOLSE: // %bb.0:
; NOLSE-NEXT: fcvt s1, h0
-; NOLSE-NEXT: ldr h0, [x0]
-; NOLSE-NEXT: b .LBB1_2
; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9, uxth
-; NOLSE-NEXT: b.eq .LBB1_5
-; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB1_3 Depth 2
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxrh w8, [x0]
+; NOLSE-NEXT: fmov s0, w8
; NOLSE-NEXT: fcvt s2, h0
-; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: fadd s2, s2, s1
; NOLSE-NEXT: fcvt h2, s2
; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxrh w10, [x0]
-; NOLSE-NEXT: cmp w10, w9, uxth
-; NOLSE-NEXT: b.ne .LBB1_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2
-; NOLSE-NEXT: stlxrh wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB1_3
-; NOLSE-NEXT: b .LBB1_1
-; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxrh w9, w8, [x0]
+; NOLSE-NEXT: cbnz w9, .LBB1_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0
; NOLSE-NEXT: ret
;
@@ -122,19 +90,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0
; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: mov w8, #32767 // =0x7fff
-; NOLSE-NEXT: ldr h0, [x0]
; NOLSE-NEXT: lsl w9, w9, #16
; NOLSE-NEXT: fmov s1, w9
-; NOLSE-NEXT: b .LBB2_2
; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1
-; NOLSE-NEXT: fmov s0, w11
-; NOLSE-NEXT: cmp w11, w9, uxth
-; NOLSE-NEXT: b.eq .LBB2_5
-; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB2_3 Depth 2
-; NOLSE-NEXT: fmov w9, s0
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxrh w9, [x0]
+; NOLSE-NEXT: fmov s0, w9
; NOLSE-NEXT: lsl w9, w9, #16
; NOLSE-NEXT: fmov s2, w9
; NOLSE-NEXT: fadd s2, s2, s1
@@ -143,21 +104,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
; NOLSE-NEXT: add w9, w9, w8
; NOLSE-NEXT: add w9, w10, w9
; NOLSE-NEXT: lsr w9, w9, #16
-; NOLSE-NEXT: fmov s2, w9
-; NOLSE-NEXT: fmov w9, s0
-; NOLSE-NEXT: fmov w10, s2
-; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxrh w11, [x0]
-; NOLSE-NEXT: cmp w11, w9, uxth
-; NOLSE-NEXT: b.ne .LBB2_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2
-; NOLSE-NEXT: stlxrh wzr, w10, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB2_3
-; NOLSE-NEXT: b .LBB2_1
-; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxrh w10, w9, [x0]
+; NOLSE-NEXT: cbnz w10, .LBB2_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0
; NOLSE-NEXT: ret
;
@@ -199,19 +148,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0
; NOLSE-NEXT: fmov w9, s0
; NOLSE-NEXT: mov w8, #32767 // =0x7fff
-; NOLSE-NEXT: ldr h0, [x0]
; NOLSE-NEXT: lsl w9, w9, #16
; NOLSE-NEXT: fmov s1, w9
-; NOLSE-NEXT: b .LBB3_2
; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1
-; NOLSE-NEXT: fmov s0, w11
-; NOLSE-NEXT: cmp w11, w9, uxth
-; NOLSE-NEXT: b.eq .LBB3_5
-; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB3_3 Depth 2
-; NOLSE-NEXT: fmov w9, s0
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxrh w9, [x0]
+; NOLSE-NEXT: fmov s0, w9
; NOLSE-NEXT: lsl w9, w9, #16
; NOLSE-NEXT: fmov s2, w9
; NOLSE-NEXT: fadd s2, s2, s1
@@ -220,21 +162,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
; NOLSE-NEXT: add w9, w9, w8
; NOLSE-NEXT: add w9, w10, w9
; NOLSE-NEXT: lsr w9, w9, #16
-; NOLSE-NEXT: fmov s2, w9
-; NOLSE-NEXT: fmov w9, s0
-; NOLSE-NEXT: fmov w10, s2
-; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxrh w11, [x0]
-; NOLSE-NEXT: cmp w11, w9, uxth
-; NOLSE-NEXT: b.ne .LBB3_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2
-; NOLSE-NEXT: stlxrh wzr, w10, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB3_3
-; NOLSE-NEXT: b .LBB3_1
-; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxrh w10, w9, [x0]
+; NOLSE-NEXT: cbnz w10, .LBB3_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0
; NOLSE-NEXT: ret
;
@@ -273,31 +203,15 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: ldr s1, [x0]
-; NOLSE-NEXT: b .LBB4_2
; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1
-; NOLSE-NEXT: fmov s1, w10
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.eq .LBB4_5
-; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB4_3 Depth 2
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: fmov s1, w8
; NOLSE-NEXT: fadd s2, s1, s0
-; NOLSE-NEXT: fmov w9, s1
; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr w10, [x0]
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.ne .LBB4_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB4_3
-; NOLSE-NEXT: b .LBB4_1
-; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxr w9, w8, [x0]
+; NOLSE-NEXT: cbnz w9, .LBB4_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: fmov s0, s1
; NOLSE-NEXT: ret
;
@@ -324,31 +238,15 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0
define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: ldr d1, [x0]
-; NOLSE-NEXT: b .LBB5_2
; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1
-; NOLSE-NEXT: fmov d1, x10
-; NOLSE-NEXT: cmp x10, x9
-; NOLSE-NEXT: b.eq .LBB5_5
-; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB5_3 Depth 2
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxr x8, [x0]
+; NOLSE-NEXT: fmov d1, x8
; NOLSE-NEXT: fadd d2, d1, d0
-; NOLSE-NEXT: fmov x9, d1
; NOLSE-NEXT: fmov x8, d2
-; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr x10, [x0]
-; NOLSE-NEXT: cmp x10, x9
-; NOLSE-NEXT: b.ne .LBB5_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, x8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB5_3
-; NOLSE-NEXT: b .LBB5_1
-; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxr w9, x8, [x0]
+; NOLSE-NEXT: cbnz w9, .LBB5_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
; NOLSE-NEXT: fmov d0, d1
; NOLSE-NEXT: ret
;
@@ -375,54 +273,26 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #
define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: sub sp, sp, #96
-; NOLSE-NEXT: ldr q1, [x0]
-; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; NOLSE-NEXT: sub sp, sp, #80
+; NOLSE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
; NOLSE-NEXT: mov x19, x0
-; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill
-; NOLSE-NEXT: b .LBB6_2
+; NOLSE-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1
-; NOLSE-NEXT: stp x12, x13, [sp, #32]
-; NOLSE-NEXT: cmp x13, x10
-; NOLSE-NEXT: ldr q1, [sp, #32]
-; NOLSE-NEXT: ccmp x12, x11, #0, eq
-; NOLSE-NEXT: b.eq .LBB6_6
-; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB6_3 Depth 2
-; NOLSE-NEXT: mov v0.16b, v1.16b
-; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
-; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxp x8, x9, [x19]
+; NOLSE-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; NOLSE-NEXT: stp x8, x9, [sp, #48]
+; NOLSE-NEXT: ldr q0, [sp, #48]
+; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill
; NOLSE-NEXT: bl __addtf3
-; NOLSE-NEXT: str q0, [sp, #48]
-; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; NOLSE-NEXT: ldp x9, x8, [sp, #48]
-; NOLSE-NEXT: str q0, [sp, #64]
-; NOLSE-NEXT: ldp x11, x10, [sp, #64]
-; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x12, x13, [x19]
-; NOLSE-NEXT: cmp x12, x11
-; NOLSE-NEXT: cset w14, ne
-; NOLSE-NEXT: cmp x13, x10
-; NOLSE-NEXT: cinc w14, w14, ne
-; NOLSE-NEXT: cbz w14, .LBB6_5
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2
-; NOLSE-NEXT: stlxp w14, x12, x13, [x19]
-; NOLSE-NEXT: cbnz w14, .LBB6_3
-; NOLSE-NEXT: b .LBB6_1
-; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2
-; NOLSE-NEXT: stlxp w14, x9, x8, [x19]
-; NOLSE-NEXT: cbnz w14, .LBB6_3
-; NOLSE-NEXT: b .LBB6_1
-; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end
-; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; NOLSE-NEXT: mov v0.16b, v1.16b
-; NOLSE-NEXT: add sp, sp, #96
+; NOLSE-NEXT: str q0, [sp, #32]
+; NOLSE-NEXT: ldp x9, x8, [sp, #32]
+; NOLSE-NEXT: stlxp w10, x9, x8, [x19]
+; NOLSE-NEXT: cbnz w10, .LBB6_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NOLSE-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; NOLSE-NEXT: add sp, sp, #80
; NOLSE-NEXT: ret
;
; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16:
@@ -463,35 +333,19 @@ define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0
define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 {
; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4:
; NOLSE: // %bb.0:
-; NOLSE-NEXT: fcvtl v1.4s, v0.4h
-; NOLSE-NEXT: ldr s0, [x0]
-; NOLSE-NEXT: b .LBB7_2
+; NOLSE-NEXT: fcvtl v0.4s, v0.4h
; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.eq .LBB7_5
-; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB7_3 Depth 2
-; NOLSE-NEXT: fcvtl v2.4s, v0.4h
-; NOLSE-NEXT: fmov w9, s0
-; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s
-; NOLSE-NEXT: fcvtn v2.4h, v2.4s
-; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr w10, [x0]
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.ne .LBB7_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB7_3
-; NOLSE-NEXT: b .LBB7_1
-; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end
-; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: fmov s1, w8
+; NOLSE-NEXT: fcvtl v1.4s, v1.4h
+; NOLSE-NEXT: fadd v1.4s, v1.4s, v0.4s
+; NOLSE-NEXT: fcvtn v1.4h, v1.4s
+; NOLSE-NEXT: fmov w9, s1
+; NOLSE-NEXT: stlxr w10, w9, [x0]
+; NOLSE-NEXT: cbnz w10, .LBB7_1
+; NOLSE-NEXT: // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT: fmov d0, x8
; NOLSE-NEXT: ret
;
; LSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4:
@@ -522,38 +376,22 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
; NOLSE: // %bb.0:
; NOLSE-NEXT: movi v1.4s, #1
; NOLSE-NEXT: movi v2.4s, #127, msl #8
-; NOLSE-NEXT: shll v3.4s, v0.4h, #16
-; NOLSE-NEXT: ldr s0, [x0]
-; NOLSE-NEXT: b .LBB8_2
+; NOLSE-NEXT: shll v0.4s, v0.4h, #16
; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.eq .LBB8_5
-; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start
-; NOLSE-NEXT: // =>This Loop Header: Depth=1
-; NOLSE-NEXT: // Child Loop BB8_3 Depth 2
-; NOLSE-NEXT: shll v4.4s, v0.4h, #16
-; NOLSE-NEXT: fmov w9, s0
-; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s
-; NOLSE-NEXT: ushr v5.4s, v4.4s, #16
-; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b
-; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s
-; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s
-; NOLSE-NEXT: fmov w8, s4
-; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start
-; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1
-; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr w10, [x0]
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.ne .LBB8_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB8_3
-; NOLSE-NEXT: b .LBB8_1
-; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end
-; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: fmov s3, w8
+; NOLSE-NEXT: shll ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/103702
More information about the llvm-branch-commits
mailing list