[llvm] 876816f - [AArch64] Set MaxAtomicSizeInBitsSupported. (#74385)

James Y Knight via llvm-commits <llvm-commits at lists.llvm.org>
Mon Dec 11 14:55:12 PST 2023


Author: James Y Knight
Date: 2023-12-11T17:55:07-05:00
New Revision: 876816ff183b6fe4e23ba2a01dfd9cd0c52bf056

URL: https://github.com/llvm/llvm-project/commit/876816ff183b6fe4e23ba2a01dfd9cd0c52bf056
DIFF: https://github.com/llvm/llvm-project/commit/876816ff183b6fe4e23ba2a01dfd9cd0c52bf056.diff

LOG: [AArch64] Set MaxAtomicSizeInBitsSupported. (#74385)

This will result in atomic operations wider than 128 bits getting expanded
to `__atomic_*` libcalls via AtomicExpandPass, which matches what Clang
already does in the frontend.
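
To illustrate (the function name and temporary below are invented for the
example, not taken from the patch), an atomic load wider than the supported
maximum, such as:

    define i256 @load_oversize(ptr %p) {
      %v = load atomic i256, ptr %p seq_cst, align 32
      ret i256 %v
    }

is rewritten by AtomicExpandPass into, roughly, the generic
size/pointer/pointer/ordering libcall form:

    %tmp = alloca i256, align 32
    call void @__atomic_load(i64 32, ptr %p, ptr %tmp, i32 5)
    %v = load i256, ptr %tmp, align 32

where 32 is the object size in bytes and 5 encodes seq_cst ordering.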

Additionally, adjust some comments, and remove the partial code dealing
with larger-than-128-bit atomics, as it is now unreachable.

AArch64 always supports 128-bit atomics, so no conditionals are needed
here. (Though we really ought to require that a 128-bit load is
available, not just a cmpxchg, which would mean conditioning on LSE2.
But that's future work.)
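
As a concrete instance of that caveat (illustrative IR only, not part of
this patch), consider a plain 128-bit atomic load:

    define i128 @load_i128(ptr %p) {
      %v = load atomic i128, ptr %p seq_cst, align 16
      ret i128 %v
    }

Only with LSE2 is an aligned 16-byte LDP guaranteed to be single-copy
atomic, so without that feature a load like this has to be lowered through
an LDXP/STXP or cmpxchg-style sequence rather than a plain load pair.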

The arm64-irtranslator.ll test was adjusted, as it was using an i256 type
as a hack to avoid IR atomic lowering in order to test GlobalISel
behavior. Pass -mattr=+lse and use i32 instead to accomplish that goal in
a way that continues to work.

Added: 
    llvm/test/CodeGen/AArch64/atomic-oversize.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bc74d44a8fbc40..3882e843fb69b8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1651,6 +1651,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 
   IsStrictFPEnabled = true;
+  setMaxAtomicSizeInBitsSupported(128);
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
@@ -24909,15 +24910,21 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
                              : AtomicExpansionKind::LLSC;
 }
 
-// For the real atomic operations, we have ldxr/stxr up to 128 bits,
+// The "default" for integer RMW operations is to expand to an LL/SC loop.
+// However, with the LSE instructions (or outline-atomics mode, which provides
+// library routines in place of the LSE-instructions), we can directly emit many
+// operations instead.
+//
+// Floating-point operations are always emitted to a cmpxchg loop, because they
+// may trigger a trap which aborts an LLSC sequence.
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
+
   if (AI->isFloatingPointOperation())
     return AtomicExpansionKind::CmpXChg;
 
-  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
-  if (Size > 128) return AtomicExpansionKind::None;
-
   bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
                       (AI->getOperation() == AtomicRMWInst::Xchg ||
                        AI->getOperation() == AtomicRMWInst::Or ||

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 575cd6b874e356..92ddc6309546f3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -aarch64-enable-atomic-cfg-tidy=0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
-; RUN: llc -O3 -aarch64-enable-atomic-cfg-tidy=0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=O3
+; RUN: llc -O0 -aarch64-enable-atomic-cfg-tidy=0 -mattr=+lse -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+; RUN: llc -O3 -aarch64-enable-atomic-cfg-tidy=0 -mattr=+lse -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=O3
 
 ; This file checks that the translation from llvm IR to generic MachineInstr
 ; is correct.
@@ -2077,190 +2077,147 @@ done:
 }
 
 ; Try a monotonic atomicrmw xchg
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_xchg(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_xchg
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_XCHG [[ADDR]](p0), [[VAL]] :: (load store monotonic (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw xchg ptr %addr, i256 1 monotonic
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_XCHG [[ADDR]](p0), [[VAL]] :: (load store monotonic (s32) on %ir.addr)
+  %oldval = atomicrmw xchg ptr %addr, i32 1 monotonic
+  ret i32 %oldval
 }
 
 ; Try an acquire atomicrmw add
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_add(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_add
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_ADD [[ADDR]](p0), [[VAL]] :: (load store acquire (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw add ptr %addr, i256 1 acquire
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_ADD [[ADDR]](p0), [[VAL]] :: (load store acquire (s32) on %ir.addr)
+  %oldval = atomicrmw add ptr %addr, i32 1 acquire
+  ret i32 %oldval
 }
 
 ; Try a release atomicrmw sub
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_sub(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_sub
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_SUB [[ADDR]](p0), [[VAL]] :: (load store release (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw sub ptr %addr, i256 1 release
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_SUB [[ADDR]](p0), [[VAL]] :: (load store release (s32) on %ir.addr)
+  %oldval = atomicrmw sub ptr %addr, i32 1 release
+  ret i32 %oldval
 }
 
 ; Try an acq_rel atomicrmw and
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_and(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_and
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_AND [[ADDR]](p0), [[VAL]] :: (load store acq_rel (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw and ptr %addr, i256 1 acq_rel
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
-}
-
-; Try an seq_cst atomicrmw nand
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_AND [[ADDR]](p0), [[VAL]] :: (load store acq_rel (s32) on %ir.addr)
+  %oldval = atomicrmw and ptr %addr, i32 1 acq_rel
+  ret i32 %oldval
+}
+
+; Try an seq_cst atomicrmw nand. NAND isn't supported by LSE, so it
+; expands to G_ATOMIC_CMPXCHG_WITH_SUCCESS.
 define i32 @test_atomicrmw_nand(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_nand
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
+; CHECK-NEXT:  successors: %bb.2(0x80000000)
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_NAND [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw nand ptr %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[NEG1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+; CHECK-NEXT:    [[OLDVALSTART:%[0-9]+]]:_(s32) = G_LOAD [[ADDR]](p0) :: (load (s32) from %ir.addr)
+; CHECK:       bb.2.atomicrmw.start:
+; CHECK-NEXT:    successors: %bb.3({{[^)]+}}), %bb.2({{[^)]+}})
+; CHECK:         [[OLDVAL:%[0-9]+]]:_(s32) = G_PHI [[OLDVALSTART]](s32), %bb.1, [[OLDVALRES:%[0-9]+]](s32), %bb.2
+; CHECK-NEXT:    [[AND:%[0-9]+]]:_(s32) = G_AND [[OLDVAL]], [[VAL]]
+; CHECK-NEXT:    [[NEWVAL:%[0-9]+]]:_(s32) = G_XOR [[AND]], [[NEG1]]
+; CHECK:         [[OLDVALRES]]:_(s32), [[SUCCESS:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[ADDR]](p0), [[OLDVAL]], [[NEWVAL]] :: (load store seq_cst seq_cst (s32) on %ir.addr)
+; CHECK-NEXT:    G_BRCOND [[SUCCESS]](s1), %bb.3
+; CHECK-NEXT:    G_BR %bb.2
+; CHECK:       bb.3.atomicrmw.end:
+  %oldval = atomicrmw nand ptr %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw or
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_or(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_or
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_OR [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw or ptr %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_OR [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s32) on %ir.addr)
+  %oldval = atomicrmw or ptr %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw xor
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_xor(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_xor
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_XOR [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw xor ptr %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_XOR [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s32) on %ir.addr)
+  %oldval = atomicrmw xor ptr %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw min
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_min(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_min
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_MIN [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw min ptr %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_MIN [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s32) on %ir.addr)
+  %oldval = atomicrmw min ptr %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw max
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_max(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_max
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_MAX [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw max ptr %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_MAX [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s32) on %ir.addr)
+  %oldval = atomicrmw max ptr %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw unsigned min
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_umin(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_umin
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_UMIN [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw umin ptr %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_UMIN [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s32) on %ir.addr)
+  %oldval = atomicrmw umin ptr %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 ; Try an seq_cst atomicrmw unsigned max
-; AArch64 will expand some atomicrmw's at the LLVM-IR level so we use a wide type to avoid this.
 define i32 @test_atomicrmw_umax(ptr %addr) {
 ; CHECK-LABEL: name: test_atomicrmw_umax
 ; CHECK:       bb.1 (%ir-block.{{[0-9]+}}):
 ; CHECK-NEXT:  liveins: $x0
 ; CHECK:         [[ADDR:%[0-9]+]]:_(p0) = COPY $x0
-; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s256) = G_CONSTANT i256 1
-; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s256) = G_ATOMICRMW_UMAX [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s256) on %ir.addr)
-; CHECK-NEXT:    [[RES:%[0-9]+]]:_(s32) = G_TRUNC [[OLDVALRES]]
-  %oldval = atomicrmw umax ptr %addr, i256 1 seq_cst
-  ; FIXME: We currently can't lower 'ret i256' and it's not the purpose of this
-  ;        test so work around it by truncating to i32 for now.
-  %oldval.trunc = trunc i256 %oldval to i32
-  ret i32 %oldval.trunc
+; CHECK-NEXT:    [[VAL:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+; CHECK-NEXT:    [[OLDVALRES:%[0-9]+]]:_(s32) = G_ATOMICRMW_UMAX [[ADDR]](p0), [[VAL]] :: (load store seq_cst (s32) on %ir.addr)
+  %oldval = atomicrmw umax ptr %addr, i32 1 seq_cst
+  ret i32 %oldval
 }
 
 @addr = global ptr null

diff --git a/llvm/test/CodeGen/AArch64/atomic-oversize.ll b/llvm/test/CodeGen/AArch64/atomic-oversize.ll
new file mode 100644
index 00000000000000..9065311a9aaa14
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/atomic-oversize.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+; Atomics larger than 128-bit are unsupported, and emit libcalls.
+define void @test(ptr %a) nounwind {
+; CHECK-LABEL: test:
+; CHECK: bl __atomic_load
+; CHECK: bl __atomic_store
+  %1 = load atomic i256, ptr %a seq_cst, align 32
+  store atomic i256 %1, ptr %a seq_cst, align 32
+  ret void
+}