[llvm] AMDGPU: Add amdgpu.no.remote.memory when upgrading old atomic intrinsics (PR #89655)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue May 21 09:53:16 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/89655
From 2cebcb7ad0dbf9a8708c1a2a2b7bb45443862423 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 21 May 2024 13:24:10 +0200
Subject: [PATCH 1/2] AArch64/ARM/PPC/X86: Add some atomic tests
FP-typed atomic load/store test coverage was mostly missing, especially
for half and bfloat.
---
.../Atomics/aarch64-atomic-load-lse2.ll | 113 ++++
.../CodeGen/AArch64/relaxed-fp-atomics.ll | 90 +++
llvm/test/CodeGen/ARM/atomic-load-store.ll | 536 ++++++++++++++++++
llvm/test/CodeGen/PowerPC/atomics.ll | 209 +++++++
llvm/test/CodeGen/X86/atomic-non-integer.ll | 97 ++++
5 files changed, 1045 insertions(+)
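
As a rough illustration of the coverage being added (not part of the patch; the function names below are made up for this note), the new tests exercise atomic loads and stores of FP types such as half and bfloat at their natural alignment:

define half @example_load_atomic_f16(ptr %ptr) {
  ; lowers to an integer halfword load plus whatever barriers the ordering requires on the target
  %val = load atomic half, ptr %ptr seq_cst, align 2
  ret half %val
}

define void @example_store_atomic_bf16(ptr %ptr, bfloat %val) {
  ; lowers to an integer halfword store plus whatever barriers the ordering requires on the target
  store atomic bfloat %val, ptr %ptr seq_cst, align 2
  ret void
}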
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
index e7e231bc344d9..3732d4feb0c67 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
@@ -566,6 +566,119 @@ define dso_local i128 @load_atomic_i128_unaligned_seq_cst_const(ptr readonly %pt
%r = load atomic i128, ptr %ptr seq_cst, align 1
ret i128 %r
}
+
+define dso_local half @load_atomic_f16_aligned_unordered(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_unordered:
+; CHECK: ldrh w8, [x0]
+ %r = load atomic half, ptr %ptr unordered, align 2
+ ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_unordered_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_unordered_const:
+; CHECK: ldrh w8, [x0]
+ %r = load atomic half, ptr %ptr unordered, align 2
+ ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_monotonic(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_monotonic:
+; CHECK: ldrh w8, [x0]
+ %r = load atomic half, ptr %ptr monotonic, align 2
+ ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_monotonic_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_monotonic_const:
+; CHECK: ldrh w8, [x0]
+ %r = load atomic half, ptr %ptr monotonic, align 2
+ ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_acquire(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_acquire:
+; CHECK: ldarh w8, [x0]
+ %r = load atomic half, ptr %ptr acquire, align 2
+ ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_acquire_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_acquire_const:
+; CHECK: ldarh w8, [x0]
+ %r = load atomic half, ptr %ptr acquire, align 2
+ ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_seq_cst(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_seq_cst:
+; CHECK: ldarh w8, [x0]
+ %r = load atomic half, ptr %ptr seq_cst, align 2
+ ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_seq_cst_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_seq_cst_const:
+; CHECK: ldarh w8, [x0]
+ %r = load atomic half, ptr %ptr seq_cst, align 2
+ ret half %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_unordered(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_unordered:
+; CHECK: ldrh w8, [x0]
+ %r = load atomic bfloat, ptr %ptr unordered, align 2
+ ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_unordered_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_unordered_const:
+; CHECK: ldrh w8, [x0]
+ %r = load atomic bfloat, ptr %ptr unordered, align 2
+ ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_monotonic(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_monotonic:
+; CHECK: ldrh w8, [x0]
+ %r = load atomic bfloat, ptr %ptr monotonic, align 2
+ ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_monotonic_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_monotonic_const:
+; CHECK: ldrh w8, [x0]
+ %r = load atomic bfloat, ptr %ptr monotonic, align 2
+ ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_acquire(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_acquire:
+; CHECK: ldarh w8, [x0]
+ %r = load atomic bfloat, ptr %ptr acquire, align 2
+ ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_acquire_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_acquire_const:
+; CHECK: ldarh w8, [x0]
+ %r = load atomic bfloat, ptr %ptr acquire, align 2
+ ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_seq_cst(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst:
+; CHECK: ldarh w8, [x0]
+ %r = load atomic bfloat, ptr %ptr seq_cst, align 2
+ ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_seq_cst_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst_const:
+; CHECK: ldarh w8, [x0]
+ %r = load atomic bfloat, ptr %ptr seq_cst, align 2
+ ret bfloat %r
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; -O0: {{.*}}
; -O1: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll b/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
index 95abbb6979be8..af664549a472a 100644
--- a/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
+++ b/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
@@ -91,4 +91,94 @@ define void @atomic_store_relaxed_f64(ptr %p, i32 %off32, i64 %off64, double %va
ret void
}
+define half @atomic_load_relaxed_f16(ptr %p, i32 %off32, i64 %off64) #0 {
+; CHECK-LABEL: atomic_load_relaxed_f16:
+ %ptr_unsigned = getelementptr half, ptr %p, i32 4095
+ %val_unsigned = load atomic half, ptr %ptr_unsigned monotonic, align 4
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr half, ptr %p, i32 %off32
+ %val_regoff = load atomic half, ptr %ptr_regoff unordered, align 4
+ %tot1 = fadd half %val_unsigned, %val_regoff
+; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+ %ptr_regoff64 = getelementptr half, ptr %p, i64 %off64
+ %val_regoff64 = load atomic half, ptr %ptr_regoff64 monotonic, align 4
+ %tot2 = fadd half %tot1, %val_regoff64
+; CHECK: ldrh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+ %ptr_unscaled = getelementptr half, ptr %p, i32 -64
+ %val_unscaled = load atomic half, ptr %ptr_unscaled unordered, align 4
+ %tot3 = fadd half %tot2, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-128]
+
+ ret half %tot3
+}
+
+define bfloat @atomic_load_relaxed_bf16(ptr %p, i32 %off32, i64 %off64) #0 {
+; CHECK-LABEL: atomic_load_relaxed_bf16:
+ %ptr_unsigned = getelementptr bfloat, ptr %p, i32 4095
+ %val_unsigned = load atomic bfloat, ptr %ptr_unsigned monotonic, align 4
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr bfloat, ptr %p, i32 %off32
+ %val_regoff = load atomic bfloat, ptr %ptr_regoff unordered, align 4
+ %tot1 = fadd bfloat %val_unsigned, %val_regoff
+; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+ %ptr_regoff64 = getelementptr bfloat, ptr %p, i64 %off64
+ %val_regoff64 = load atomic bfloat, ptr %ptr_regoff64 monotonic, align 4
+ %tot2 = fadd bfloat %tot1, %val_regoff64
+; CHECK: ldrh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+ %ptr_unscaled = getelementptr bfloat, ptr %p, i32 -64
+ %val_unscaled = load atomic bfloat, ptr %ptr_unscaled unordered, align 4
+ %tot3 = fadd bfloat %tot2, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-128]
+
+ ret bfloat %tot3
+}
+
+define void @atomic_store_relaxed_f16(ptr %p, i32 %off32, i64 %off64, half %val) #0 {
+; CHECK-LABEL: atomic_store_relaxed_f16:
+ %ptr_unsigned = getelementptr half, ptr %p, i32 4095
+ store atomic half %val, ptr %ptr_unsigned monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr half, ptr %p, i32 %off32
+ store atomic half %val, ptr %ptr_regoff unordered, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+ %ptr_regoff64 = getelementptr half, ptr %p, i64 %off64
+ store atomic half %val, ptr %ptr_regoff64 monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+ %ptr_unscaled = getelementptr half, ptr %p, i32 -64
+ store atomic half %val, ptr %ptr_unscaled unordered, align 4
+; CHECK: sturh {{w[0-9]+}}, [x0, #-128]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_bf16(ptr %p, i32 %off32, i64 %off64, bfloat %val) #0 {
+; CHECK-LABEL: atomic_store_relaxed_bf16:
+ %ptr_unsigned = getelementptr bfloat, ptr %p, i32 4095
+ store atomic bfloat %val, ptr %ptr_unsigned monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr bfloat, ptr %p, i32 %off32
+ store atomic bfloat %val, ptr %ptr_regoff unordered, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+ %ptr_regoff64 = getelementptr bfloat, ptr %p, i64 %off64
+ store atomic bfloat %val, ptr %ptr_regoff64 monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+ %ptr_unscaled = getelementptr bfloat, ptr %p, i32 -64
+ store atomic bfloat %val, ptr %ptr_unscaled unordered, align 4
+; CHECK: sturh {{w[0-9]+}}, [x0, #-128]
+
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 4f2e63b5f2467..c53fb2f330a79 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -439,3 +439,539 @@ define void @test_old_store_64bit(ptr %p, i64 %v) {
store atomic i64 %v, ptr %p seq_cst, align 8
ret void
}
+
+define half @load_atomic_f16__seq_cst(ptr %ptr) {
+; ARM-LABEL: load_atomic_f16__seq_cst:
+; ARM: @ %bb.0:
+; ARM-NEXT: ldrh r0, [r0]
+; ARM-NEXT: dmb ish
+; ARM-NEXT: bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f16__seq_cst:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: ldrh r0, [r0]
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f16__seq_cst:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: ldrh r0, [r0]
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: load_atomic_f16__seq_cst:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: movs r1, #0
+; THUMBONE-NEXT: mov r2, r1
+; THUMBONE-NEXT: bl __sync_val_compare_and_swap_2
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f16__seq_cst:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: mov r1, #5
+; ARMV4-NEXT: bl __atomic_load_2
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f16__seq_cst:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: ldrh r0, [r0]
+; ARMV6-NEXT: mov r1, #0
+; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT: bx lr
+;
+; THUMBM-LABEL: load_atomic_f16__seq_cst:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: ldrh r0, [r0]
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: bx lr
+ %val = load atomic half, ptr %ptr seq_cst, align 2
+ ret half %val
+}
+
+define bfloat @load_atomic_bf16__seq_cst(ptr %ptr) {
+; ARM-LABEL: load_atomic_bf16__seq_cst:
+; ARM: @ %bb.0:
+; ARM-NEXT: ldrh r0, [r0]
+; ARM-NEXT: dmb ish
+; ARM-NEXT: bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_bf16__seq_cst:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: ldrh r0, [r0]
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: bx lr
+;
+; THUMBTWO-LABEL: load_atomic_bf16__seq_cst:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: ldrh r0, [r0]
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: load_atomic_bf16__seq_cst:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: movs r1, #0
+; THUMBONE-NEXT: mov r2, r1
+; THUMBONE-NEXT: bl __sync_val_compare_and_swap_2
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_bf16__seq_cst:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: mov r1, #5
+; ARMV4-NEXT: bl __atomic_load_2
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_bf16__seq_cst:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: ldrh r0, [r0]
+; ARMV6-NEXT: mov r1, #0
+; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT: bx lr
+;
+; THUMBM-LABEL: load_atomic_bf16__seq_cst:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: ldrh r0, [r0]
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: bx lr
+ %val = load atomic bfloat, ptr %ptr seq_cst, align 2
+ ret bfloat %val
+}
+
+define float @load_atomic_f32__seq_cst(ptr %ptr) {
+; ARM-LABEL: load_atomic_f32__seq_cst:
+; ARM: @ %bb.0:
+; ARM-NEXT: ldr r0, [r0]
+; ARM-NEXT: dmb ish
+; ARM-NEXT: bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f32__seq_cst:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: ldr r0, [r0]
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: vmov s0, r0
+; ARMOPTNONE-NEXT: bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f32__seq_cst:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: ldr r0, [r0]
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: load_atomic_f32__seq_cst:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: movs r1, #0
+; THUMBONE-NEXT: mov r2, r1
+; THUMBONE-NEXT: bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f32__seq_cst:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: mov r1, #5
+; ARMV4-NEXT: bl __atomic_load_4
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f32__seq_cst:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: ldr r0, [r0]
+; ARMV6-NEXT: mov r1, #0
+; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT: bx lr
+;
+; THUMBM-LABEL: load_atomic_f32__seq_cst:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: ldr r0, [r0]
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: bx lr
+ %val = load atomic float, ptr %ptr seq_cst, align 4
+ ret float %val
+}
+
+define double @load_atomic_f64__seq_cst(ptr %ptr) {
+; ARM-LABEL: load_atomic_f64__seq_cst:
+; ARM: @ %bb.0:
+; ARM-NEXT: ldrexd r0, r1, [r0]
+; ARM-NEXT: clrex
+; ARM-NEXT: dmb ish
+; ARM-NEXT: bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f64__seq_cst:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: ldrexd r2, r3, [r0]
+; ARMOPTNONE-NEXT: mov r1, r3
+; ARMOPTNONE-NEXT: mov r0, r2
+; ARMOPTNONE-NEXT: clrex
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: vmov d16, r0, r1
+; ARMOPTNONE-NEXT: bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f64__seq_cst:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: ldrexd r0, r1, [r0]
+; THUMBTWO-NEXT: clrex
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: load_atomic_f64__seq_cst:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: sub sp, #8
+; THUMBONE-NEXT: movs r2, #0
+; THUMBONE-NEXT: str r2, [sp]
+; THUMBONE-NEXT: str r2, [sp, #4]
+; THUMBONE-NEXT: mov r3, r2
+; THUMBONE-NEXT: bl __sync_val_compare_and_swap_8
+; THUMBONE-NEXT: add sp, #8
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f64__seq_cst:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: mov r1, #5
+; ARMV4-NEXT: bl __atomic_load_8
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f64__seq_cst:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: ldrexd r0, r1, [r0]
+; ARMV6-NEXT: mov r2, #0
+; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT: bx lr
+;
+; THUMBM-LABEL: load_atomic_f64__seq_cst:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: push {r7, lr}
+; THUMBM-NEXT: movs r1, #5
+; THUMBM-NEXT: bl __atomic_load_8
+; THUMBM-NEXT: pop {r7, pc}
+ %val = load atomic double, ptr %ptr seq_cst, align 8
+ ret double %val
+}
+
+define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) {
+; ARM-LABEL: store_atomic_f16__seq_cst:
+; ARM: @ %bb.0:
+; ARM-NEXT: dmb ish
+; ARM-NEXT: strh r1, [r0]
+; ARM-NEXT: dmb ish
+; ARM-NEXT: bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_f16__seq_cst:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: sub sp, sp, #4
+; ARMOPTNONE-NEXT: str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT: mov r1, r0
+; ARMOPTNONE-NEXT: ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT: vmov s0, r0
+; ARMOPTNONE-NEXT: vmov r0, s0
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: strh r0, [r1]
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: add sp, sp, #4
+; ARMOPTNONE-NEXT: bx lr
+;
+; THUMBTWO-LABEL: store_atomic_f16__seq_cst:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: strh r1, [r0]
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: store_atomic_f16__seq_cst:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: bl __sync_lock_test_and_set_2
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f16__seq_cst:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: mov r2, #5
+; ARMV4-NEXT: bl __atomic_store_2
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f16__seq_cst:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: mov r2, #0
+; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT: strh r1, [r0]
+; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT: bx lr
+;
+; THUMBM-LABEL: store_atomic_f16__seq_cst:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: strh r1, [r0]
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: bx lr
+ store atomic half %val1, ptr %ptr seq_cst, align 2
+ ret void
+}
+
+define void @store_atomic_bf16__seq_cst(ptr %ptr, bfloat %val1) {
+; ARM-LABEL: store_atomic_bf16__seq_cst:
+; ARM: @ %bb.0:
+; ARM-NEXT: dmb ish
+; ARM-NEXT: strh r1, [r0]
+; ARM-NEXT: dmb ish
+; ARM-NEXT: bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_bf16__seq_cst:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: sub sp, sp, #4
+; ARMOPTNONE-NEXT: str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT: mov r1, r0
+; ARMOPTNONE-NEXT: ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT: vmov s0, r0
+; ARMOPTNONE-NEXT: vmov r0, s0
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: strh r0, [r1]
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: add sp, sp, #4
+; ARMOPTNONE-NEXT: bx lr
+;
+; THUMBTWO-LABEL: store_atomic_bf16__seq_cst:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: strh r1, [r0]
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: store_atomic_bf16__seq_cst:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: bl __sync_lock_test_and_set_2
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_bf16__seq_cst:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: mov r2, #5
+; ARMV4-NEXT: bl __atomic_store_2
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_bf16__seq_cst:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: mov r2, #0
+; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT: strh r1, [r0]
+; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT: bx lr
+;
+; THUMBM-LABEL: store_atomic_bf16__seq_cst:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: strh r1, [r0]
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: bx lr
+ store atomic bfloat %val1, ptr %ptr seq_cst, align 2
+ ret void
+}
+
+define void @store_atomic_f32__seq_cst(ptr %ptr, float %val1) {
+; ARM-LABEL: store_atomic_f32__seq_cst:
+; ARM: @ %bb.0:
+; ARM-NEXT: dmb ish
+; ARM-NEXT: str r1, [r0]
+; ARM-NEXT: dmb ish
+; ARM-NEXT: bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_f32__seq_cst:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: sub sp, sp, #4
+; ARMOPTNONE-NEXT: str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT: mov r1, r0
+; ARMOPTNONE-NEXT: ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT: vmov s0, r0
+; ARMOPTNONE-NEXT: vmov r0, s0
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: str r0, [r1]
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: add sp, sp, #4
+; ARMOPTNONE-NEXT: bx lr
+;
+; THUMBTWO-LABEL: store_atomic_f32__seq_cst:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: str r1, [r0]
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: store_atomic_f32__seq_cst:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: bl __sync_lock_test_and_set_4
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f32__seq_cst:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: mov r2, #5
+; ARMV4-NEXT: bl __atomic_store_4
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f32__seq_cst:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: mov r2, #0
+; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT: str r1, [r0]
+; ARMV6-NEXT: mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT: bx lr
+;
+; THUMBM-LABEL: store_atomic_f32__seq_cst:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: str r1, [r0]
+; THUMBM-NEXT: dmb sy
+; THUMBM-NEXT: bx lr
+ store atomic float %val1, ptr %ptr seq_cst, align 4
+ ret void
+}
+
+define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) {
+; ARM-LABEL: store_atomic_f64__seq_cst:
+; ARM: @ %bb.0:
+; ARM-NEXT: push {r4, r5, lr}
+; ARM-NEXT: mov r3, r2
+; ARM-NEXT: dmb ish
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: LBB13_1: @ %atomicrmw.start
+; ARM-NEXT: @ =>This Inner Loop Header: Depth=1
+; ARM-NEXT: ldrexd r4, r5, [r0]
+; ARM-NEXT: strexd r1, r2, r3, [r0]
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: bne LBB13_1
+; ARM-NEXT: @ %bb.2: @ %atomicrmw.end
+; ARM-NEXT: dmb ish
+; ARM-NEXT: pop {r4, r5, pc}
+;
+; ARMOPTNONE-LABEL: store_atomic_f64__seq_cst:
+; ARMOPTNONE: @ %bb.0:
+; ARMOPTNONE-NEXT: push {r4, r5, r7, lr}
+; ARMOPTNONE-NEXT: add r7, sp, #8
+; ARMOPTNONE-NEXT: push {r8, r10, r11}
+; ARMOPTNONE-NEXT: sub sp, sp, #20
+; ARMOPTNONE-NEXT: str r0, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT: vmov d16, r1, r2
+; ARMOPTNONE-NEXT: vmov r1, r2, d16
+; ARMOPTNONE-NEXT: str r2, [sp, #4] @ 4-byte Spill
+; ARMOPTNONE-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: ldr r1, [r0]
+; ARMOPTNONE-NEXT: ldr r0, [r0, #4]
+; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARMOPTNONE-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARMOPTNONE-NEXT: b LBB13_1
+; ARMOPTNONE-NEXT: LBB13_1: @ %atomicrmw.start
+; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1
+; ARMOPTNONE-NEXT: @ Child Loop BB13_2 Depth 2
+; ARMOPTNONE-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARMOPTNONE-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; ARMOPTNONE-NEXT: ldr r3, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARMOPTNONE-NEXT: ldr r10, [sp, #8] @ 4-byte Reload
+; ARMOPTNONE-NEXT: @ kill: def $r10 killed $r10 def $r10_r11
+; ARMOPTNONE-NEXT: mov r11, r0
+; ARMOPTNONE-NEXT: mov r8, r2
+; ARMOPTNONE-NEXT: mov r9, r1
+; ARMOPTNONE-NEXT: LBB13_2: @ %atomicrmw.start
+; ARMOPTNONE-NEXT: @ Parent Loop BB13_1 Depth=1
+; ARMOPTNONE-NEXT: @ => This Inner Loop Header: Depth=2
+; ARMOPTNONE-NEXT: ldrexd r4, r5, [r3]
+; ARMOPTNONE-NEXT: cmp r4, r8
+; ARMOPTNONE-NEXT: cmpeq r5, r9
+; ARMOPTNONE-NEXT: bne LBB13_4
+; ARMOPTNONE-NEXT: @ %bb.3: @ %atomicrmw.start
+; ARMOPTNONE-NEXT: @ in Loop: Header=BB13_2 Depth=2
+; ARMOPTNONE-NEXT: strexd r0, r10, r11, [r3]
+; ARMOPTNONE-NEXT: cmp r0, #0
+; ARMOPTNONE-NEXT: bne LBB13_2
+; ARMOPTNONE-NEXT: LBB13_4: @ %atomicrmw.start
+; ARMOPTNONE-NEXT: @ in Loop: Header=BB13_1 Depth=1
+; ARMOPTNONE-NEXT: mov r0, r5
+; ARMOPTNONE-NEXT: eor r3, r0, r1
+; ARMOPTNONE-NEXT: mov r1, r4
+; ARMOPTNONE-NEXT: eor r2, r1, r2
+; ARMOPTNONE-NEXT: orr r2, r2, r3
+; ARMOPTNONE-NEXT: cmp r2, #0
+; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARMOPTNONE-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARMOPTNONE-NEXT: bne LBB13_1
+; ARMOPTNONE-NEXT: b LBB13_5
+; ARMOPTNONE-NEXT: LBB13_5: @ %atomicrmw.end
+; ARMOPTNONE-NEXT: dmb ish
+; ARMOPTNONE-NEXT: sub sp, r7, #20
+; ARMOPTNONE-NEXT: pop {r8, r10, r11}
+; ARMOPTNONE-NEXT: pop {r4, r5, r7, pc}
+;
+; THUMBTWO-LABEL: store_atomic_f64__seq_cst:
+; THUMBTWO: @ %bb.0:
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: LBB13_1: @ %atomicrmw.start
+; THUMBTWO-NEXT: @ =>This Inner Loop Header: Depth=1
+; THUMBTWO-NEXT: ldrexd r3, r9, [r0]
+; THUMBTWO-NEXT: strexd r3, r1, r2, [r0]
+; THUMBTWO-NEXT: cmp r3, #0
+; THUMBTWO-NEXT: bne LBB13_1
+; THUMBTWO-NEXT: @ %bb.2: @ %atomicrmw.end
+; THUMBTWO-NEXT: dmb ish
+; THUMBTWO-NEXT: bx lr
+;
+; THUMBONE-LABEL: store_atomic_f64__seq_cst:
+; THUMBONE: @ %bb.0:
+; THUMBONE-NEXT: push {r7, lr}
+; THUMBONE-NEXT: bl __sync_lock_test_and_set_8
+; THUMBONE-NEXT: pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f64__seq_cst:
+; ARMV4: @ %bb.0:
+; ARMV4-NEXT: push {r11, lr}
+; ARMV4-NEXT: sub sp, sp, #8
+; ARMV4-NEXT: mov r1, #5
+; ARMV4-NEXT: str r1, [sp]
+; ARMV4-NEXT: bl __atomic_store_8
+; ARMV4-NEXT: add sp, sp, #8
+; ARMV4-NEXT: pop {r11, lr}
+; ARMV4-NEXT: mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f64__seq_cst:
+; ARMV6: @ %bb.0:
+; ARMV6-NEXT: push {r4, r5, r11, lr}
+; ARMV6-NEXT: @ kill: def $r3 killed $r3 killed $r2_r3 def $r2_r3
+; ARMV6-NEXT: mov r1, #0
+; ARMV6-NEXT: @ kill: def $r2 killed $r2 killed $r2_r3 def $r2_r3
+; ARMV6-NEXT: mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT: .LBB13_1: @ %atomicrmw.start
+; ARMV6-NEXT: @ =>This Inner Loop Header: Depth=1
+; ARMV6-NEXT: ldrexd r4, r5, [r0]
+; ARMV6-NEXT: strexd r1, r2, r3, [r0]
+; ARMV6-NEXT: cmp r1, #0
+; ARMV6-NEXT: bne .LBB13_1
+; ARMV6-NEXT: @ %bb.2: @ %atomicrmw.end
+; ARMV6-NEXT: mov r0, #0
+; ARMV6-NEXT: mcr p15, #0, r0, c7, c10, #5
+; ARMV6-NEXT: pop {r4, r5, r11, pc}
+;
+; THUMBM-LABEL: store_atomic_f64__seq_cst:
+; THUMBM: @ %bb.0:
+; THUMBM-NEXT: push {r7, lr}
+; THUMBM-NEXT: sub sp, #8
+; THUMBM-NEXT: movs r1, #5
+; THUMBM-NEXT: str r1, [sp]
+; THUMBM-NEXT: bl __atomic_store_8
+; THUMBM-NEXT: add sp, #8
+; THUMBM-NEXT: pop {r7, pc}
+ store atomic double %val1, ptr %ptr seq_cst, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll
index 04cdbe9d7e785..ff5bec53acd25 100644
--- a/llvm/test/CodeGen/PowerPC/atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics.ll
@@ -462,3 +462,212 @@ define i64 @and_i64_release(ptr %mem, i64 %operand) {
%val = atomicrmw and ptr %mem, i64 %operand release
ret i64 %val
}
+
+define half @load_atomic_f16__seq_cst(ptr %ptr) {
+; PPC32-LABEL: load_atomic_f16__seq_cst:
+; PPC32: # %bb.0:
+; PPC32-NEXT: mflr r0
+; PPC32-NEXT: stwu r1, -16(r1)
+; PPC32-NEXT: stw r0, 20(r1)
+; PPC32-NEXT: .cfi_def_cfa_offset 16
+; PPC32-NEXT: .cfi_offset lr, 4
+; PPC32-NEXT: sync
+; PPC32-NEXT: lhz r3, 0(r3)
+; PPC32-NEXT: cmpw cr7, r3, r3
+; PPC32-NEXT: bne- cr7, .+4
+; PPC32-NEXT: isync
+; PPC32-NEXT: bl __gnu_h2f_ieee
+; PPC32-NEXT: lwz r0, 20(r1)
+; PPC32-NEXT: addi r1, r1, 16
+; PPC32-NEXT: mtlr r0
+; PPC32-NEXT: blr
+;
+; PPC64-LABEL: load_atomic_f16__seq_cst:
+; PPC64: # %bb.0:
+; PPC64-NEXT: mflr r0
+; PPC64-NEXT: stdu r1, -112(r1)
+; PPC64-NEXT: std r0, 128(r1)
+; PPC64-NEXT: .cfi_def_cfa_offset 112
+; PPC64-NEXT: .cfi_offset lr, 16
+; PPC64-NEXT: sync
+; PPC64-NEXT: lhz r3, 0(r3)
+; PPC64-NEXT: cmpd cr7, r3, r3
+; PPC64-NEXT: bne- cr7, .+4
+; PPC64-NEXT: isync
+; PPC64-NEXT: bl __gnu_h2f_ieee
+; PPC64-NEXT: nop
+; PPC64-NEXT: addi r1, r1, 112
+; PPC64-NEXT: ld r0, 16(r1)
+; PPC64-NEXT: mtlr r0
+; PPC64-NEXT: blr
+ %val = load atomic half, ptr %ptr seq_cst, align 2
+ ret half %val
+}
+
+; FIXME: bf16_to_fp fails to select
+; define bfloat @load_atomic_bf16__seq_cst(ptr %ptr) {
+; %val = load atomic bfloat, ptr %ptr seq_cst, align 2
+; ret bfloat %val
+; }
+
+define float @load_atomic_f32__seq_cst(ptr %ptr) {
+; PPC32-LABEL: load_atomic_f32__seq_cst:
+; PPC32: # %bb.0:
+; PPC32-NEXT: stwu r1, -16(r1)
+; PPC32-NEXT: .cfi_def_cfa_offset 16
+; PPC32-NEXT: sync
+; PPC32-NEXT: lwz r3, 0(r3)
+; PPC32-NEXT: cmpw cr7, r3, r3
+; PPC32-NEXT: bne- cr7, .+4
+; PPC32-NEXT: isync
+; PPC32-NEXT: stw r3, 12(r1)
+; PPC32-NEXT: lfs f1, 12(r1)
+; PPC32-NEXT: addi r1, r1, 16
+; PPC32-NEXT: blr
+;
+; PPC64-LABEL: load_atomic_f32__seq_cst:
+; PPC64: # %bb.0:
+; PPC64-NEXT: sync
+; PPC64-NEXT: lwz r3, 0(r3)
+; PPC64-NEXT: cmpd cr7, r3, r3
+; PPC64-NEXT: bne- cr7, .+4
+; PPC64-NEXT: isync
+; PPC64-NEXT: stw r3, -4(r1)
+; PPC64-NEXT: lfs f1, -4(r1)
+; PPC64-NEXT: blr
+ %val = load atomic float, ptr %ptr seq_cst, align 4
+ ret float %val
+}
+
+define double @load_atomic_f64__seq_cst(ptr %ptr) {
+; PPC32-LABEL: load_atomic_f64__seq_cst:
+; PPC32: # %bb.0:
+; PPC32-NEXT: mflr r0
+; PPC32-NEXT: stwu r1, -16(r1)
+; PPC32-NEXT: stw r0, 20(r1)
+; PPC32-NEXT: .cfi_def_cfa_offset 16
+; PPC32-NEXT: .cfi_offset lr, 4
+; PPC32-NEXT: li r4, 5
+; PPC32-NEXT: bl __atomic_load_8
+; PPC32-NEXT: stw r4, 12(r1)
+; PPC32-NEXT: stw r3, 8(r1)
+; PPC32-NEXT: lfd f1, 8(r1)
+; PPC32-NEXT: lwz r0, 20(r1)
+; PPC32-NEXT: addi r1, r1, 16
+; PPC32-NEXT: mtlr r0
+; PPC32-NEXT: blr
+;
+; PPC64-LABEL: load_atomic_f64__seq_cst:
+; PPC64: # %bb.0:
+; PPC64-NEXT: sync
+; PPC64-NEXT: ld r3, 0(r3)
+; PPC64-NEXT: cmpd cr7, r3, r3
+; PPC64-NEXT: bne- cr7, .+4
+; PPC64-NEXT: isync
+; PPC64-NEXT: std r3, -8(r1)
+; PPC64-NEXT: lfd f1, -8(r1)
+; PPC64-NEXT: blr
+ %val = load atomic double, ptr %ptr seq_cst, align 8
+ ret double %val
+}
+
+define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) {
+; PPC32-LABEL: store_atomic_f16__seq_cst:
+; PPC32: # %bb.0:
+; PPC32-NEXT: mflr r0
+; PPC32-NEXT: stwu r1, -16(r1)
+; PPC32-NEXT: stw r0, 20(r1)
+; PPC32-NEXT: .cfi_def_cfa_offset 16
+; PPC32-NEXT: .cfi_offset lr, 4
+; PPC32-NEXT: .cfi_offset r30, -8
+; PPC32-NEXT: stw r30, 8(r1) # 4-byte Folded Spill
+; PPC32-NEXT: mr r30, r3
+; PPC32-NEXT: bl __gnu_f2h_ieee
+; PPC32-NEXT: sync
+; PPC32-NEXT: sth r3, 0(r30)
+; PPC32-NEXT: lwz r30, 8(r1) # 4-byte Folded Reload
+; PPC32-NEXT: lwz r0, 20(r1)
+; PPC32-NEXT: addi r1, r1, 16
+; PPC32-NEXT: mtlr r0
+; PPC32-NEXT: blr
+;
+; PPC64-LABEL: store_atomic_f16__seq_cst:
+; PPC64: # %bb.0:
+; PPC64-NEXT: mflr r0
+; PPC64-NEXT: stdu r1, -128(r1)
+; PPC64-NEXT: std r0, 144(r1)
+; PPC64-NEXT: .cfi_def_cfa_offset 128
+; PPC64-NEXT: .cfi_offset lr, 16
+; PPC64-NEXT: .cfi_offset r30, -16
+; PPC64-NEXT: std r30, 112(r1) # 8-byte Folded Spill
+; PPC64-NEXT: mr r30, r3
+; PPC64-NEXT: bl __gnu_f2h_ieee
+; PPC64-NEXT: nop
+; PPC64-NEXT: sync
+; PPC64-NEXT: sth r3, 0(r30)
+; PPC64-NEXT: ld r30, 112(r1) # 8-byte Folded Reload
+; PPC64-NEXT: addi r1, r1, 128
+; PPC64-NEXT: ld r0, 16(r1)
+; PPC64-NEXT: mtlr r0
+; PPC64-NEXT: blr
+ store atomic half %val1, ptr %ptr seq_cst, align 2
+ ret void
+}
+
+; FIXME: bf16_to_fp fails to select
+; define void @store_atomic_bf16__seq_cst(ptr %ptr, bfloat %val1) {
+; store atomic bfloat %val1, ptr %ptr seq_cst, align 2
+; ret void
+; }
+
+define void @store_atomic_f32__seq_cst(ptr %ptr, float %val1) {
+; PPC32-LABEL: store_atomic_f32__seq_cst:
+; PPC32: # %bb.0:
+; PPC32-NEXT: stwu r1, -16(r1)
+; PPC32-NEXT: .cfi_def_cfa_offset 16
+; PPC32-NEXT: stfs f1, 12(r1)
+; PPC32-NEXT: lwz r4, 12(r1)
+; PPC32-NEXT: sync
+; PPC32-NEXT: stw r4, 0(r3)
+; PPC32-NEXT: addi r1, r1, 16
+; PPC32-NEXT: blr
+;
+; PPC64-LABEL: store_atomic_f32__seq_cst:
+; PPC64: # %bb.0:
+; PPC64-NEXT: stfs f1, -4(r1)
+; PPC64-NEXT: lwz r4, -4(r1)
+; PPC64-NEXT: sync
+; PPC64-NEXT: stw r4, 0(r3)
+; PPC64-NEXT: blr
+ store atomic float %val1, ptr %ptr seq_cst, align 4
+ ret void
+}
+
+define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) {
+; PPC32-LABEL: store_atomic_f64__seq_cst:
+; PPC32: # %bb.0:
+; PPC32-NEXT: mflr r0
+; PPC32-NEXT: stwu r1, -16(r1)
+; PPC32-NEXT: stw r0, 20(r1)
+; PPC32-NEXT: .cfi_def_cfa_offset 16
+; PPC32-NEXT: .cfi_offset lr, 4
+; PPC32-NEXT: stfd f1, 8(r1)
+; PPC32-NEXT: li r7, 5
+; PPC32-NEXT: lwz r5, 8(r1)
+; PPC32-NEXT: lwz r6, 12(r1)
+; PPC32-NEXT: bl __atomic_store_8
+; PPC32-NEXT: lwz r0, 20(r1)
+; PPC32-NEXT: addi r1, r1, 16
+; PPC32-NEXT: mtlr r0
+; PPC32-NEXT: blr
+;
+; PPC64-LABEL: store_atomic_f64__seq_cst:
+; PPC64: # %bb.0:
+; PPC64-NEXT: stfd f1, -8(r1)
+; PPC64-NEXT: ld r4, -8(r1)
+; PPC64-NEXT: sync
+; PPC64-NEXT: std r4, 0(r3)
+; PPC64-NEXT: blr
+ store atomic double %val1, ptr %ptr seq_cst, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 9995e7d3a4d31..d7633cb11e44c 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -787,3 +787,100 @@ define double @load_double_seq_cst(ptr %fptr) {
%v = load atomic double, ptr %fptr seq_cst, align 8
ret double %v
}
+
+define void @store_bfloat(ptr %fptr, bfloat %v) {
+; X86-LABEL: store_bfloat:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movw %cx, (%eax)
+; X86-NEXT: retl
+;
+; X64-SSE-LABEL: store_bfloat:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: pextrw $0, %xmm0, %eax
+; X64-SSE-NEXT: movw %ax, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: store_bfloat:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpextrw $0, %xmm0, %eax
+; X64-AVX-NEXT: movw %ax, (%rdi)
+; X64-AVX-NEXT: retq
+ store atomic bfloat %v, ptr %fptr unordered, align 2
+ ret void
+}
+
+; Work around issue #92899 by casting to float
+define float @load_bfloat(ptr %fptr) {
+; X86-SSE1-LABEL: load_bfloat:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %eax
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movzwl (%eax), %eax
+; X86-SSE1-NEXT: shll $16, %eax
+; X86-SSE1-NEXT: movl %eax, (%esp)
+; X86-SSE1-NEXT: flds (%esp)
+; X86-SSE1-NEXT: popl %eax
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: load_bfloat:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %eax
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movzwl (%eax), %eax
+; X86-SSE2-NEXT: shll $16, %eax
+; X86-SSE2-NEXT: movd %eax, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, (%esp)
+; X86-SSE2-NEXT: flds (%esp)
+; X86-SSE2-NEXT: popl %eax
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: load_bfloat:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movzwl (%eax), %eax
+; X86-AVX-NEXT: shll $16, %eax
+; X86-AVX-NEXT: vmovd %eax, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, (%esp)
+; X86-AVX-NEXT: flds (%esp)
+; X86-AVX-NEXT: popl %eax
+; X86-AVX-NEXT: .cfi_def_cfa_offset 4
+; X86-AVX-NEXT: retl
+;
+; X86-NOSSE-LABEL: load_bfloat:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %eax
+; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movzwl (%eax), %eax
+; X86-NOSSE-NEXT: shll $16, %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: flds (%esp)
+; X86-NOSSE-NEXT: popl %eax
+; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4
+; X86-NOSSE-NEXT: retl
+;
+; X64-SSE-LABEL: load_bfloat:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movzwl (%rdi), %eax
+; X64-SSE-NEXT: shll $16, %eax
+; X64-SSE-NEXT: movd %eax, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: load_bfloat:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movzwl (%rdi), %eax
+; X64-AVX-NEXT: shll $16, %eax
+; X64-AVX-NEXT: vmovd %eax, %xmm0
+; X64-AVX-NEXT: retq
+ %v = load atomic bfloat, ptr %fptr unordered, align 2
+ %ext = fpext bfloat %v to float
+ ret float %ext
+}
From 01f49144a01ac3452c94aee8e3f5460e985ab5dc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 22 Apr 2024 21:17:47 +0200
Subject: [PATCH 2/2] AMDGPU: Add amdgpu.no.remote.memory when upgrading old
atomic intrinsics
This should better replicate the old intrinsic behavior once codegen
of the raw atomicrmw instruction requires this metadata in the future.
---
llvm/lib/IR/AutoUpgrade.cpp | 2 ++
llvm/test/Bitcode/amdgcn-atomic.ll | 40 +++++++++++++++---------------
2 files changed, 22 insertions(+), 20 deletions(-)
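
For reference, a minimal sketch (with a made-up function name, and the empty !0 node spelled out) of what the upgrade now emits for one of the old intrinsic calls, matching the updated CHECK lines in the test below:

define i32 @example_upgraded_inc(ptr %ptr) {
  ; was: %r = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
  %r = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
  ret i32 %r
}

!0 = !{}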
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a7ed2de6e8a5f..8b2cb97d12cf1 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -2358,6 +2358,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
SyncScope::ID SSID = F->getContext().getOrInsertSyncScopeID("agent");
AtomicRMWInst *RMW =
Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order, SSID);
+ RMW->setMetadata("amdgpu.no.remote.memory",
+ MDNode::get(F->getContext(), {}));
if (!VolatileArg || !VolatileArg->isZero())
RMW->setVolatile(true);
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index 2e6286a7df8df..1abba570f01e1 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -2,13 +2,13 @@
define void @atomic_inc(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
- ; CHECK: atomicrmw uinc_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw uinc_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
- ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
- ; CHECK: atomicrmw uinc_wrap ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
; CHECK: atomicrmw uinc_wrap ptr %ptr0, i64 48 syncscope("agent") seq_cst, align 8
@@ -26,13 +26,13 @@ define void @atomic_inc(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr
}
define void @atomic_dec(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
- ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
- ; CHECK: atomicrmw udec_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw udec_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
- ; CHECK: atomicrmw udec_wrap ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw udec_wrap ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
; CHECK: atomicrmw udec_wrap ptr %ptr0, i64 48 syncscope("agent") seq_cst, align 8
@@ -51,49 +51,49 @@ define void @atomic_dec(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr
; Test some invalid ordering handling
define void @ordering(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
- ; CHECK: atomicrmw volatile uinc_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw volatile uinc_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true)
- ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true)
- ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false)
- ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") monotonic, align 4
+ ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") monotonic, align 4, !amdgpu.no.remote.memory !0
%result3 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true)
- ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result4 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false)
- ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result5 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true)
- ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result6 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false)
- ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result7 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true)
- ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result8 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false)
- ; CHECK:= atomicrmw volatile udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK:= atomicrmw volatile udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result9 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true)
- ; CHECK:= atomicrmw volatile udec_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ ; CHECK:= atomicrmw volatile udec_wrap ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result10 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true)
ret void
}
define void @immarg_violations(ptr %ptr0, i32 %val32, i1 %val1) {
- ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0
%result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false)
-; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4, !amdgpu.no.remote.memory !0
%result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 2, i32 %val32, i1 false)
- ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+ ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4, !amdgpu.no.remote.memory !0
%result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1)
ret void
}