[llvm] 549e118 - [PowerPC] Support 16-byte lock free atomics on pwr8 and up
Kai Luo via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 8 16:26:02 PDT 2022
Author: Kai Luo
Date: 2022-04-08T23:25:56Z
New Revision: 549e118e93c666914a1045fde38a2cac33e1e445
URL: https://github.com/llvm/llvm-project/commit/549e118e93c666914a1045fde38a2cac33e1e445
DIFF: https://github.com/llvm/llvm-project/commit/549e118e93c666914a1045fde38a2cac33e1e445.diff
LOG: [PowerPC] Support 16-byte lock free atomics on pwr8 and up
Make 16-byte atomic types 16-byte aligned on PPC64, consistent with GCC. Also enable inlining of 16-byte atomics on non-AIX PPC64 targets.
Reviewed By: hubert.reinterpretcast
Differential Revision: https://reviews.llvm.org/D122377
Added:
clang/test/CodeGen/PowerPC/quadword-atomics.c
Modified:
clang/lib/Basic/Targets/PPC.cpp
clang/lib/Basic/Targets/PPC.h
clang/test/CodeGen/PowerPC/atomic-alignment.c
clang/test/Sema/atomic-ops.c
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.h
llvm/test/CodeGen/PowerPC/atomics-i128.ll
Removed:
################################################################################
diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index bafcc23b38334..1f2f583b9462d 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -81,6 +81,8 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
IsISA3_0 = true;
} else if (Feature == "+isa-v31-instructions") {
IsISA3_1 = true;
+ } else if (Feature == "+quadword-atomics") {
+ HasQuadwordAtomics = true;
}
// TODO: Finish this list and add an assert that we've handled them
// all.
@@ -550,6 +552,12 @@ bool PPCTargetInfo::initFeatureMap(
Features["isa-v30-instructions"] =
llvm::StringSwitch<bool>(CPU).Case("pwr9", true).Default(false);
+ Features["quadword-atomics"] =
+ getTriple().isArch64Bit() && llvm::StringSwitch<bool>(CPU)
+ .Case("pwr9", true)
+ .Case("pwr8", true)
+ .Default(false);
+
// Power10 includes all the same features as Power9 plus any features specific
// to the Power10 core.
if (CPU == "pwr10" || CPU == "power10") {
@@ -660,6 +668,7 @@ bool PPCTargetInfo::hasFeature(StringRef Feature) const {
.Case("isa-v207-instructions", IsISA2_07)
.Case("isa-v30-instructions", IsISA3_0)
.Case("isa-v31-instructions", IsISA3_1)
+ .Case("quadword-atomics", HasQuadwordAtomics)
.Default(false);
}
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index ac52eb219f54d..44489d06307f2 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -78,6 +78,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
bool IsISA2_07 = false;
bool IsISA3_0 = false;
bool IsISA3_1 = false;
+ bool HasQuadwordAtomics = false;
protected:
std::string ABI;
@@ -439,8 +440,18 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo {
DataLayout += "-S128-v256:256:256-v512:512:512";
resetDataLayout(DataLayout);
- // PPC64 supports atomics up to 8 bytes.
- MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
+ // Newer PPC64 instruction sets support atomics up to 16 bytes.
+ MaxAtomicPromoteWidth = 128;
+ // Baseline PPC64 supports inlining atomics up to 8 bytes.
+ MaxAtomicInlineWidth = 64;
+ }
+
+ void setMaxAtomicWidth() override {
+ // For power8 and up, backend is able to inline 16-byte atomic lock free
+ // code.
+ // TODO: We should allow AIX to inline quadword atomics in the future.
+ if (!getTriple().isOSAIX() && hasFeature("quadword-atomics"))
+ MaxAtomicInlineWidth = 128;
}
BuiltinVaListKind getBuiltinVaListKind() const override {
diff --git a/clang/test/CodeGen/PowerPC/atomic-alignment.c b/clang/test/CodeGen/PowerPC/atomic-alignment.c
index cd6985962c39e..537ba1a95c048 100644
--- a/clang/test/CodeGen/PowerPC/atomic-alignment.c
+++ b/clang/test/CodeGen/PowerPC/atomic-alignment.c
@@ -1,25 +1,30 @@
-// RUN: %clang_cc1 -verify -triple powerpc-unknown-unknown -emit-llvm -o - %s | \
+// RUN: %clang_cc1 -Werror -triple powerpc-unknown-unknown -emit-llvm -o - %s | \
// RUN: FileCheck %s --check-prefixes=PPC,PPC32
-// RUN: %clang_cc1 -verify -triple powerpc64le-unknown-linux -emit-llvm -o - %s | \
-// RUN: FileCheck %s --check-prefixes=PPC,PPC64
-// RUN: %clang_cc1 -verify -triple powerpc64-unknown-aix -emit-llvm -o - %s | \
+// RUN: %clang_cc1 -Werror -triple powerpc64le-unknown-linux -emit-llvm -o - %s | \
// RUN: FileCheck %s --check-prefixes=PPC,PPC64
+// RUN: %clang_cc1 -Werror -triple powerpc64le-unknown-linux -emit-llvm -o - %s \
+// RUN: -target-cpu pwr8 | FileCheck %s --check-prefixes=PPC,PPC64
+// RUN: %clang_cc1 -Werror -triple powerpc64-unknown-aix -emit-llvm -o - %s | \
+// RUN: FileCheck %s --check-prefixes=PPC,AIX64
+// RUN: %clang_cc1 -Werror -triple powerpc64-unknown-aix -emit-llvm -o - %s \
+// RUN: -target-cpu pwr8 | FileCheck %s --check-prefixes=PPC,AIX64
// PPC: @c = global i8 0, align 1{{$}}
-_Atomic(char) c; // expected-no-diagnostics
+_Atomic(char) c;
// PPC: @s = global i16 0, align 2{{$}}
-_Atomic(short) s; // expected-no-diagnostics
+_Atomic(short) s;
// PPC: @i = global i32 0, align 4{{$}}
-_Atomic(int) i; // expected-no-diagnostics
+_Atomic(int) i;
// PPC32: @l = global i32 0, align 4{{$}}
// PPC64: @l = global i64 0, align 8{{$}}
-_Atomic(long) l; // expected-no-diagnostics
+// AIX64: @l = global i64 0, align 8{{$}}
+_Atomic(long) l;
// PPC: @ll = global i64 0, align 8{{$}}
-_Atomic(long long) ll; // expected-no-diagnostics
+_Atomic(long long) ll;
typedef struct {
char x[8];
@@ -27,11 +32,14 @@ typedef struct {
// PPC32: @o = global %struct.O zeroinitializer, align 1{{$}}
// PPC64: @o = global %struct.O zeroinitializer, align 8{{$}}
-_Atomic(O) o; // expected-no-diagnostics
+// AIX64: @o = global %struct.O zeroinitializer, align 8{{$}}
+_Atomic(O) o;
typedef struct {
char x[16];
} Q;
-// PPC: @q = global %struct.Q zeroinitializer, align 1{{$}}
-_Atomic(Q) q; // expected-no-diagnostics
+// PPC32: @q = global %struct.Q zeroinitializer, align 1{{$}}
+// PPC64: @q = global %struct.Q zeroinitializer, align 16{{$}}
+// AIX64: @q = global %struct.Q zeroinitializer, align 16{{$}}
+_Atomic(Q) q;
diff --git a/clang/test/CodeGen/PowerPC/quadword-atomics.c b/clang/test/CodeGen/PowerPC/quadword-atomics.c
new file mode 100644
index 0000000000000..b1da89c2785b7
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/quadword-atomics.c
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64le-linux-gnu \
+// RUN: -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64-PWR8
+// RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64le-linux-gnu \
+// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64
+// RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64-unknown-aix \
+// RUN: -target-cpu pwr7 -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64
+
+typedef struct {
+ char x[16];
+} Q;
+
+typedef _Atomic(Q) AtomicQ;
+
+typedef __int128_t int128_t;
+
+// PPC64-PWR8-LABEL: @test_load(
+// PPC64-PWR8: [[TMP3:%.*]] = load atomic i128, i128* [[TMP1:%.*]] acquire, align 16
+//
+// PPC64-LABEL: @test_load(
+// PPC64: call void @__atomic_load(i64 noundef 16, i8* noundef [[TMP3:%.*]], i8* noundef [[TMP4:%.*]], i32 noundef signext 2)
+//
+Q test_load(AtomicQ *ptr) {
+ // expected-no-diagnostics
+ return __c11_atomic_load(ptr, __ATOMIC_ACQUIRE);
+}
+
+// PPC64-PWR8-LABEL: @test_store(
+// PPC64-PWR8: store atomic i128 [[TMP6:%.*]], i128* [[TMP4:%.*]] release, align 16
+//
+// PPC64-LABEL: @test_store(
+// PPC64: call void @__atomic_store(i64 noundef 16, i8* noundef [[TMP6:%.*]], i8* noundef [[TMP7:%.*]], i32 noundef signext 3)
+//
+void test_store(Q val, AtomicQ *ptr) {
+ // expected-no-diagnostics
+ __c11_atomic_store(ptr, val, __ATOMIC_RELEASE);
+}
+
+// PPC64-PWR8-LABEL: @test_add(
+// PPC64-PWR8: [[TMP3:%.*]] = atomicrmw add i128* [[TMP0:%.*]], i128 [[TMP2:%.*]] monotonic, align 16
+//
+// PPC64-LABEL: @test_add(
+// PPC64: [[CALL:%.*]] = call i128 @__atomic_fetch_add_16(i8* noundef [[TMP2:%.*]], i128 noundef [[TMP3:%.*]], i32 noundef signext 0)
+//
+void test_add(_Atomic(int128_t) *ptr, int128_t x) {
+ // expected-no-diagnostics
+ __c11_atomic_fetch_add(ptr, x, __ATOMIC_RELAXED);
+}
+
+// PPC64-PWR8-LABEL: @test_xchg(
+// PPC64-PWR8: [[TMP8:%.*]] = atomicrmw xchg i128* [[TMP4:%.*]], i128 [[TMP7:%.*]] seq_cst, align 16
+//
+// PPC64-LABEL: @test_xchg(
+// PPC64: call void @__atomic_exchange(i64 noundef 16, i8* noundef [[TMP7:%.*]], i8* noundef [[TMP8:%.*]], i8* noundef [[TMP9:%.*]], i32 noundef signext 5)
+//
+Q test_xchg(AtomicQ *ptr, Q new) {
+ // expected-no-diagnostics
+ return __c11_atomic_exchange(ptr, new, __ATOMIC_SEQ_CST);
+}
+
+// PPC64-PWR8-LABEL: @test_cmpxchg(
+// PPC64-PWR8: [[TMP10:%.*]] = cmpxchg i128* [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
+//
+// PPC64-LABEL: @test_cmpxchg(
+// PPC64: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, i8* noundef [[TMP8:%.*]], i8* noundef [[TMP9:%.*]], i8* noundef [[TMP10:%.*]], i32 noundef signext 5, i32 noundef signext 0)
+//
+int test_cmpxchg(AtomicQ *ptr, Q *cmp, Q new) {
+ // expected-no-diagnostics
+ return __c11_atomic_compare_exchange_strong(ptr, cmp, new, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+}
+
+// PPC64-PWR8-LABEL: @test_cmpxchg_weak(
+// PPC64-PWR8: [[TMP10:%.*]] = cmpxchg weak i128* [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
+//
+// PPC64-LABEL: @test_cmpxchg_weak(
+// PPC64: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, i8* noundef [[TMP8:%.*]], i8* noundef [[TMP9:%.*]], i8* noundef [[TMP10:%.*]], i32 noundef signext 5, i32 noundef signext 0)
+//
+int test_cmpxchg_weak(AtomicQ *ptr, Q *cmp, Q new) {
+ // expected-no-diagnostics
+ return __c11_atomic_compare_exchange_weak(ptr, cmp, new, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+}
+
+// PPC64-PWR8-LABEL: @is_lock_free(
+// PPC64-PWR8: ret i32 1
+//
+// PPC64-LABEL: @is_lock_free(
+// PPC64: [[CALL:%.*]] = call zeroext i1 @__atomic_is_lock_free(i64 noundef 16, i8* noundef null)
+//
+int is_lock_free() {
+ AtomicQ q;
+ // expected-no-diagnostics
+ return __c11_atomic_is_lock_free(sizeof(q));
+}
diff --git a/clang/test/Sema/atomic-ops.c b/clang/test/Sema/atomic-ops.c
index a3c156d6663b9..3ad469d337433 100644
--- a/clang/test/Sema/atomic-ops.c
+++ b/clang/test/Sema/atomic-ops.c
@@ -9,7 +9,7 @@
// RUN: -target-cpu pwr7
// RUN: %clang_cc1 %s -verify -fgnuc-version=4.2.1 -ffreestanding \
// RUN: -fsyntax-only -triple=powerpc64le-linux-gnu -std=c11 \
-// RUN: -target-cpu pwr8
+// RUN: -target-cpu pwr8 -DPPC64_PWR8
// Basic parsing/Sema tests for __c11_atomic_*
@@ -47,7 +47,11 @@ _Static_assert(__c11_atomic_is_lock_free(2), "");
_Static_assert(__c11_atomic_is_lock_free(3), ""); // expected-error {{not an integral constant expression}}
_Static_assert(__c11_atomic_is_lock_free(4), "");
_Static_assert(__c11_atomic_is_lock_free(8), "");
+#ifndef PPC64_PWR8
_Static_assert(__c11_atomic_is_lock_free(16), ""); // expected-error {{not an integral constant expression}}
+#else
+_Static_assert(__c11_atomic_is_lock_free(16), ""); // expected-no-error
+#endif
_Static_assert(__c11_atomic_is_lock_free(17), ""); // expected-error {{not an integral constant expression}}
_Static_assert(__atomic_is_lock_free(1, 0), "");
@@ -55,15 +59,23 @@ _Static_assert(__atomic_is_lock_free(2, 0), "");
_Static_assert(__atomic_is_lock_free(3, 0), ""); // expected-error {{not an integral constant expression}}
_Static_assert(__atomic_is_lock_free(4, 0), "");
_Static_assert(__atomic_is_lock_free(8, 0), "");
+#ifndef PPC64_PWR8
_Static_assert(__atomic_is_lock_free(16, 0), ""); // expected-error {{not an integral constant expression}}
+#else
+_Static_assert(__atomic_is_lock_free(16, 0), ""); // expected-no-error
+#endif
_Static_assert(__atomic_is_lock_free(17, 0), ""); // expected-error {{not an integral constant expression}}
_Static_assert(atomic_is_lock_free((atomic_char*)0), "");
_Static_assert(atomic_is_lock_free((atomic_short*)0), "");
_Static_assert(atomic_is_lock_free((atomic_int*)0), "");
_Static_assert(atomic_is_lock_free((atomic_long*)0), "");
+#ifndef PPC64_PWR8
// noi128-error at +1 {{__int128 is not supported on this target}}
_Static_assert(atomic_is_lock_free((_Atomic(__int128)*)0), ""); // expected-error {{not an integral constant expression}}
+#else
+_Static_assert(atomic_is_lock_free((_Atomic(__int128)*)0), ""); // expected-no-error
+#endif
_Static_assert(atomic_is_lock_free(0 + (atomic_char*)0), "");
char i8;
@@ -88,7 +100,11 @@ _Static_assert(__atomic_always_lock_free(2, 0), "");
_Static_assert(!__atomic_always_lock_free(3, 0), "");
_Static_assert(__atomic_always_lock_free(4, 0), "");
_Static_assert(__atomic_always_lock_free(8, 0), "");
+#ifndef PPC64_PWR8
_Static_assert(!__atomic_always_lock_free(16, 0), "");
+#else
+_Static_assert(__atomic_always_lock_free(16, 0), "");
+#endif
_Static_assert(!__atomic_always_lock_free(17, 0), "");
_Static_assert(__atomic_always_lock_free(1, incomplete), "");
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index b126ed486b0d6..3a04faf3e685a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1321,7 +1321,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
- if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics()) {
+ if (shouldInlineQuadwordAtomics()) {
setMaxAtomicSizeInBitsSupported(128);
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
@@ -18053,10 +18053,18 @@ CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
}
}
+bool PPCTargetLowering::shouldInlineQuadwordAtomics() const {
+ // TODO: 16-byte atomic type support for AIX is in progress; we should be able
+ // to inline 16-byte atomic ops on AIX too in the future.
+ return Subtarget.isPPC64() &&
+ (EnableQuadwordAtomics || !Subtarget.getTargetTriple().isOSAIX()) &&
+ Subtarget.hasQuadwordAtomics();
+}
+
TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
- if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128)
+ if (shouldInlineQuadwordAtomics() && Size == 128)
return AtomicExpansionKind::MaskedIntrinsic;
return TargetLowering::shouldExpandAtomicRMWInIR(AI);
}
@@ -18064,7 +18072,7 @@ PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
- if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128)
+ if (shouldInlineQuadwordAtomics() && Size == 128)
return AtomicExpansionKind::MaskedIntrinsic;
return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI);
}
@@ -18094,8 +18102,7 @@ getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
- assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() &&
- "Only support quadword now");
+ assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = Incr->getType();
assert(ValTy->getPrimitiveSizeInBits() == 128);
@@ -18119,8 +18126,7 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
- assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() &&
- "Only support quadword now");
+ assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = CmpVal->getType();
assert(ValTy->getPrimitiveSizeInBits() == 128);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 826d26ce85631..f92a117fe27fd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -910,6 +910,8 @@ namespace llvm {
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
+ bool shouldInlineQuadwordAtomics() const;
+
TargetLowering::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/PowerPC/atomics-i128.ll b/llvm/test/CodeGen/PowerPC/atomics-i128.ll
index 95a2eb5df45ec..62b69aed56b1c 100644
--- a/llvm/test/CodeGen/PowerPC/atomics-i128.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics-i128.ll
@@ -5,6 +5,22 @@
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown -mcpu=pwr7 \
; RUN: -ppc-asm-full-reg-names -ppc-quadword-atomics \
; RUN: -ppc-track-subreg-liveness < %s | FileCheck --check-prefix=PWR7 %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 \
+; RUN: -ppc-asm-full-reg-names -ppc-track-subreg-liveness < %s | FileCheck \
+; RUN: --check-prefix=LE-PWR8 %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-freebsd -mcpu=pwr8 \
+; RUN: -ppc-asm-full-reg-names -ppc-track-subreg-liveness < %s | FileCheck \
+; RUN: --check-prefix=LE-PWR8 %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-aix -mcpu=pwr8 \
+; RUN: -ppc-asm-full-reg-names -ppc-track-subreg-liveness < %s | FileCheck \
+; RUN: --check-prefix=AIX64-PWR8 %s
+
; On 32-bit PPC platforms, 16-byte lock-free atomic instructions are not
; available, so inlined lock-free code is not expected to be generated on such
; platforms, even if the arch level is pwr8 or later and `-ppc-quadword-atomics` is on.
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-unknown -mcpu=pwr8 \
+; RUN: -ppc-quadword-atomics -ppc-asm-full-reg-names -ppc-track-subreg-liveness < %s \
+; RUN: | FileCheck --check-prefix=PPC-PWR8 %s
define i128 @swap(i128* %a, i128 %x) {
@@ -39,6 +55,62 @@ define i128 @swap(i128* %a, i128 %x) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: swap:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: sync
+; LE-PWR8-NEXT: .LBB0_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r6, 0, r3
+; LE-PWR8-NEXT: mr r9, r4
+; LE-PWR8-NEXT: mr r8, r5
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB0_1
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r7
+; LE-PWR8-NEXT: mr r4, r6
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: swap:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: sync
+; AIX64-PWR8-NEXT: bl .__sync_lock_test_and_set_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: swap:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -48(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 48
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: mr r4, r3
+; PPC-PWR8-NEXT: stw r7, 40(r1)
+; PPC-PWR8-NEXT: stw r6, 36(r1)
+; PPC-PWR8-NEXT: addi r6, r1, 16
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: li r7, 5
+; PPC-PWR8-NEXT: stw r5, 32(r1)
+; PPC-PWR8-NEXT: addi r5, r1, 32
+; PPC-PWR8-NEXT: stw r8, 44(r1)
+; PPC-PWR8-NEXT: bl __atomic_exchange
+; PPC-PWR8-NEXT: lwz r6, 28(r1)
+; PPC-PWR8-NEXT: lwz r5, 24(r1)
+; PPC-PWR8-NEXT: lwz r4, 20(r1)
+; PPC-PWR8-NEXT: lwz r3, 16(r1)
+; PPC-PWR8-NEXT: lwz r0, 52(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 48
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = atomicrmw xchg i128* %a, i128 %x seq_cst, align 16
ret i128 %0
@@ -76,6 +148,109 @@ define i128 @add(i128* %a, i128 %x) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: add:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: sync
+; LE-PWR8-NEXT: .LBB1_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r6, 0, r3
+; LE-PWR8-NEXT: addc r9, r4, r7
+; LE-PWR8-NEXT: adde r8, r5, r6
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB1_1
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r7
+; LE-PWR8-NEXT: mr r4, r6
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: add:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: sync
+; AIX64-PWR8-NEXT: bl .__sync_fetch_and_add_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: add:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -80(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 80
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: .cfi_offset r24, -32
+; PPC-PWR8-NEXT: .cfi_offset r25, -28
+; PPC-PWR8-NEXT: .cfi_offset r26, -24
+; PPC-PWR8-NEXT: .cfi_offset r27, -20
+; PPC-PWR8-NEXT: .cfi_offset r28, -16
+; PPC-PWR8-NEXT: .cfi_offset r29, -12
+; PPC-PWR8-NEXT: .cfi_offset r30, -8
+; PPC-PWR8-NEXT: stw r26, 56(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r27, 60(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r27, r5
+; PPC-PWR8-NEXT: mr r26, r3
+; PPC-PWR8-NEXT: lwz r5, 8(r3)
+; PPC-PWR8-NEXT: lwz r4, 4(r3)
+; PPC-PWR8-NEXT: stw r28, 64(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r28, r6
+; PPC-PWR8-NEXT: lwz r6, 12(r3)
+; PPC-PWR8-NEXT: lwz r3, 0(r3)
+; PPC-PWR8-NEXT: stw r24, 48(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: addi r24, r1, 16
+; PPC-PWR8-NEXT: stw r25, 52(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r29, 68(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r29, r7
+; PPC-PWR8-NEXT: addi r25, r1, 32
+; PPC-PWR8-NEXT: stw r30, 72(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r30, r8
+; PPC-PWR8-NEXT: .p2align 4
+; PPC-PWR8-NEXT: .LBB1_1: # %atomicrmw.start
+; PPC-PWR8-NEXT: #
+; PPC-PWR8-NEXT: addc r7, r6, r30
+; PPC-PWR8-NEXT: stw r4, 36(r1)
+; PPC-PWR8-NEXT: stw r3, 32(r1)
+; PPC-PWR8-NEXT: adde r8, r5, r29
+; PPC-PWR8-NEXT: stw r5, 40(r1)
+; PPC-PWR8-NEXT: stw r6, 44(r1)
+; PPC-PWR8-NEXT: mr r5, r25
+; PPC-PWR8-NEXT: adde r4, r4, r28
+; PPC-PWR8-NEXT: stw r7, 28(r1)
+; PPC-PWR8-NEXT: stw r8, 24(r1)
+; PPC-PWR8-NEXT: mr r6, r24
+; PPC-PWR8-NEXT: adde r3, r3, r27
+; PPC-PWR8-NEXT: stw r4, 20(r1)
+; PPC-PWR8-NEXT: mr r4, r26
+; PPC-PWR8-NEXT: li r7, 5
+; PPC-PWR8-NEXT: stw r3, 16(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: li r8, 5
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: cmplwi r3, 0
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: beq cr0, .LBB1_1
+; PPC-PWR8-NEXT: # %bb.2: # %atomicrmw.end
+; PPC-PWR8-NEXT: lwz r30, 72(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r29, 68(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r28, 64(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r27, 60(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r26, 56(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r25, 52(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r24, 48(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r0, 84(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 80
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = atomicrmw add i128* %a, i128 %x seq_cst, align 16
ret i128 %0
@@ -113,6 +288,109 @@ define i128 @sub(i128* %a, i128 %x) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: sub:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: sync
+; LE-PWR8-NEXT: .LBB2_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r6, 0, r3
+; LE-PWR8-NEXT: subc r9, r7, r4
+; LE-PWR8-NEXT: subfe r8, r5, r6
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB2_1
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r7
+; LE-PWR8-NEXT: mr r4, r6
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: sub:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: sync
+; AIX64-PWR8-NEXT: bl .__sync_fetch_and_sub_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: sub:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -80(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 80
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: .cfi_offset r24, -32
+; PPC-PWR8-NEXT: .cfi_offset r25, -28
+; PPC-PWR8-NEXT: .cfi_offset r26, -24
+; PPC-PWR8-NEXT: .cfi_offset r27, -20
+; PPC-PWR8-NEXT: .cfi_offset r28, -16
+; PPC-PWR8-NEXT: .cfi_offset r29, -12
+; PPC-PWR8-NEXT: .cfi_offset r30, -8
+; PPC-PWR8-NEXT: stw r26, 56(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r27, 60(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r27, r5
+; PPC-PWR8-NEXT: mr r26, r3
+; PPC-PWR8-NEXT: lwz r5, 8(r3)
+; PPC-PWR8-NEXT: lwz r4, 4(r3)
+; PPC-PWR8-NEXT: stw r28, 64(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r28, r6
+; PPC-PWR8-NEXT: lwz r6, 12(r3)
+; PPC-PWR8-NEXT: lwz r3, 0(r3)
+; PPC-PWR8-NEXT: stw r24, 48(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: addi r24, r1, 16
+; PPC-PWR8-NEXT: stw r25, 52(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r29, 68(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r29, r7
+; PPC-PWR8-NEXT: addi r25, r1, 32
+; PPC-PWR8-NEXT: stw r30, 72(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r30, r8
+; PPC-PWR8-NEXT: .p2align 4
+; PPC-PWR8-NEXT: .LBB2_1: # %atomicrmw.start
+; PPC-PWR8-NEXT: #
+; PPC-PWR8-NEXT: subc r7, r6, r30
+; PPC-PWR8-NEXT: stw r4, 36(r1)
+; PPC-PWR8-NEXT: stw r3, 32(r1)
+; PPC-PWR8-NEXT: subfe r8, r29, r5
+; PPC-PWR8-NEXT: stw r5, 40(r1)
+; PPC-PWR8-NEXT: stw r6, 44(r1)
+; PPC-PWR8-NEXT: mr r5, r25
+; PPC-PWR8-NEXT: subfe r4, r28, r4
+; PPC-PWR8-NEXT: stw r7, 28(r1)
+; PPC-PWR8-NEXT: stw r8, 24(r1)
+; PPC-PWR8-NEXT: mr r6, r24
+; PPC-PWR8-NEXT: subfe r3, r27, r3
+; PPC-PWR8-NEXT: stw r4, 20(r1)
+; PPC-PWR8-NEXT: mr r4, r26
+; PPC-PWR8-NEXT: li r7, 5
+; PPC-PWR8-NEXT: stw r3, 16(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: li r8, 5
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: cmplwi r3, 0
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: beq cr0, .LBB2_1
+; PPC-PWR8-NEXT: # %bb.2: # %atomicrmw.end
+; PPC-PWR8-NEXT: lwz r30, 72(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r29, 68(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r28, 64(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r27, 60(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r26, 56(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r25, 52(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r24, 48(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r0, 84(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 80
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = atomicrmw sub i128* %a, i128 %x seq_cst, align 16
ret i128 %0
@@ -150,6 +428,109 @@ define i128 @and(i128* %a, i128 %x) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: and:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: sync
+; LE-PWR8-NEXT: .LBB3_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r6, 0, r3
+; LE-PWR8-NEXT: and r9, r4, r7
+; LE-PWR8-NEXT: and r8, r5, r6
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB3_1
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r7
+; LE-PWR8-NEXT: mr r4, r6
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: and:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: sync
+; AIX64-PWR8-NEXT: bl .__sync_fetch_and_and_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: and:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -80(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 80
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: .cfi_offset r24, -32
+; PPC-PWR8-NEXT: .cfi_offset r25, -28
+; PPC-PWR8-NEXT: .cfi_offset r26, -24
+; PPC-PWR8-NEXT: .cfi_offset r27, -20
+; PPC-PWR8-NEXT: .cfi_offset r28, -16
+; PPC-PWR8-NEXT: .cfi_offset r29, -12
+; PPC-PWR8-NEXT: .cfi_offset r30, -8
+; PPC-PWR8-NEXT: stw r26, 56(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r27, 60(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r27, r5
+; PPC-PWR8-NEXT: mr r26, r3
+; PPC-PWR8-NEXT: lwz r5, 8(r3)
+; PPC-PWR8-NEXT: lwz r4, 4(r3)
+; PPC-PWR8-NEXT: stw r28, 64(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r28, r6
+; PPC-PWR8-NEXT: lwz r6, 12(r3)
+; PPC-PWR8-NEXT: lwz r3, 0(r3)
+; PPC-PWR8-NEXT: stw r24, 48(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: addi r24, r1, 16
+; PPC-PWR8-NEXT: stw r25, 52(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r29, 68(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r29, r7
+; PPC-PWR8-NEXT: addi r25, r1, 32
+; PPC-PWR8-NEXT: stw r30, 72(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r30, r8
+; PPC-PWR8-NEXT: .p2align 4
+; PPC-PWR8-NEXT: .LBB3_1: # %atomicrmw.start
+; PPC-PWR8-NEXT: #
+; PPC-PWR8-NEXT: stw r3, 32(r1)
+; PPC-PWR8-NEXT: stw r4, 36(r1)
+; PPC-PWR8-NEXT: and r7, r5, r29
+; PPC-PWR8-NEXT: and r8, r6, r30
+; PPC-PWR8-NEXT: and r3, r3, r27
+; PPC-PWR8-NEXT: and r4, r4, r28
+; PPC-PWR8-NEXT: stw r5, 40(r1)
+; PPC-PWR8-NEXT: stw r6, 44(r1)
+; PPC-PWR8-NEXT: mr r5, r25
+; PPC-PWR8-NEXT: mr r6, r24
+; PPC-PWR8-NEXT: stw r8, 28(r1)
+; PPC-PWR8-NEXT: stw r7, 24(r1)
+; PPC-PWR8-NEXT: li r7, 5
+; PPC-PWR8-NEXT: li r8, 5
+; PPC-PWR8-NEXT: stw r4, 20(r1)
+; PPC-PWR8-NEXT: stw r3, 16(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: mr r4, r26
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: cmplwi r3, 0
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: beq cr0, .LBB3_1
+; PPC-PWR8-NEXT: # %bb.2: # %atomicrmw.end
+; PPC-PWR8-NEXT: lwz r30, 72(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r29, 68(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r28, 64(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r27, 60(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r26, 56(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r25, 52(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r24, 48(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r0, 84(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 80
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = atomicrmw and i128* %a, i128 %x seq_cst, align 16
ret i128 %0
@@ -187,6 +568,109 @@ define i128 @or(i128* %a, i128 %x) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: or:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: sync
+; LE-PWR8-NEXT: .LBB4_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r6, 0, r3
+; LE-PWR8-NEXT: or r9, r4, r7
+; LE-PWR8-NEXT: or r8, r5, r6
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB4_1
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r7
+; LE-PWR8-NEXT: mr r4, r6
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: or:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: sync
+; AIX64-PWR8-NEXT: bl .__sync_fetch_and_or_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: or:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -80(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 80
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: .cfi_offset r24, -32
+; PPC-PWR8-NEXT: .cfi_offset r25, -28
+; PPC-PWR8-NEXT: .cfi_offset r26, -24
+; PPC-PWR8-NEXT: .cfi_offset r27, -20
+; PPC-PWR8-NEXT: .cfi_offset r28, -16
+; PPC-PWR8-NEXT: .cfi_offset r29, -12
+; PPC-PWR8-NEXT: .cfi_offset r30, -8
+; PPC-PWR8-NEXT: stw r26, 56(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r27, 60(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r27, r5
+; PPC-PWR8-NEXT: mr r26, r3
+; PPC-PWR8-NEXT: lwz r5, 8(r3)
+; PPC-PWR8-NEXT: lwz r4, 4(r3)
+; PPC-PWR8-NEXT: stw r28, 64(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r28, r6
+; PPC-PWR8-NEXT: lwz r6, 12(r3)
+; PPC-PWR8-NEXT: lwz r3, 0(r3)
+; PPC-PWR8-NEXT: stw r24, 48(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: addi r24, r1, 16
+; PPC-PWR8-NEXT: stw r25, 52(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r29, 68(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r29, r7
+; PPC-PWR8-NEXT: addi r25, r1, 32
+; PPC-PWR8-NEXT: stw r30, 72(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r30, r8
+; PPC-PWR8-NEXT: .p2align 4
+; PPC-PWR8-NEXT: .LBB4_1: # %atomicrmw.start
+; PPC-PWR8-NEXT: #
+; PPC-PWR8-NEXT: stw r3, 32(r1)
+; PPC-PWR8-NEXT: stw r4, 36(r1)
+; PPC-PWR8-NEXT: or r7, r5, r29
+; PPC-PWR8-NEXT: or r8, r6, r30
+; PPC-PWR8-NEXT: or r3, r3, r27
+; PPC-PWR8-NEXT: or r4, r4, r28
+; PPC-PWR8-NEXT: stw r5, 40(r1)
+; PPC-PWR8-NEXT: stw r6, 44(r1)
+; PPC-PWR8-NEXT: mr r5, r25
+; PPC-PWR8-NEXT: mr r6, r24
+; PPC-PWR8-NEXT: stw r8, 28(r1)
+; PPC-PWR8-NEXT: stw r7, 24(r1)
+; PPC-PWR8-NEXT: li r7, 5
+; PPC-PWR8-NEXT: li r8, 5
+; PPC-PWR8-NEXT: stw r4, 20(r1)
+; PPC-PWR8-NEXT: stw r3, 16(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: mr r4, r26
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: cmplwi r3, 0
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: beq cr0, .LBB4_1
+; PPC-PWR8-NEXT: # %bb.2: # %atomicrmw.end
+; PPC-PWR8-NEXT: lwz r30, 72(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r29, 68(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r28, 64(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r27, 60(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r26, 56(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r25, 52(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r24, 48(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r0, 84(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 80
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = atomicrmw or i128* %a, i128 %x seq_cst, align 16
ret i128 %0
@@ -224,6 +708,109 @@ define i128 @xor(i128* %a, i128 %x) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: xor:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: sync
+; LE-PWR8-NEXT: .LBB5_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r6, 0, r3
+; LE-PWR8-NEXT: xor r9, r4, r7
+; LE-PWR8-NEXT: xor r8, r5, r6
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB5_1
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r7
+; LE-PWR8-NEXT: mr r4, r6
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: xor:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: sync
+; AIX64-PWR8-NEXT: bl .__sync_fetch_and_xor_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: xor:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -80(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 80
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: .cfi_offset r24, -32
+; PPC-PWR8-NEXT: .cfi_offset r25, -28
+; PPC-PWR8-NEXT: .cfi_offset r26, -24
+; PPC-PWR8-NEXT: .cfi_offset r27, -20
+; PPC-PWR8-NEXT: .cfi_offset r28, -16
+; PPC-PWR8-NEXT: .cfi_offset r29, -12
+; PPC-PWR8-NEXT: .cfi_offset r30, -8
+; PPC-PWR8-NEXT: stw r26, 56(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r27, 60(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r27, r5
+; PPC-PWR8-NEXT: mr r26, r3
+; PPC-PWR8-NEXT: lwz r5, 8(r3)
+; PPC-PWR8-NEXT: lwz r4, 4(r3)
+; PPC-PWR8-NEXT: stw r28, 64(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r28, r6
+; PPC-PWR8-NEXT: lwz r6, 12(r3)
+; PPC-PWR8-NEXT: lwz r3, 0(r3)
+; PPC-PWR8-NEXT: stw r24, 48(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: addi r24, r1, 16
+; PPC-PWR8-NEXT: stw r25, 52(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r29, 68(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r29, r7
+; PPC-PWR8-NEXT: addi r25, r1, 32
+; PPC-PWR8-NEXT: stw r30, 72(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r30, r8
+; PPC-PWR8-NEXT: .p2align 4
+; PPC-PWR8-NEXT: .LBB5_1: # %atomicrmw.start
+; PPC-PWR8-NEXT: #
+; PPC-PWR8-NEXT: stw r3, 32(r1)
+; PPC-PWR8-NEXT: stw r4, 36(r1)
+; PPC-PWR8-NEXT: xor r7, r5, r29
+; PPC-PWR8-NEXT: xor r8, r6, r30
+; PPC-PWR8-NEXT: xor r3, r3, r27
+; PPC-PWR8-NEXT: xor r4, r4, r28
+; PPC-PWR8-NEXT: stw r5, 40(r1)
+; PPC-PWR8-NEXT: stw r6, 44(r1)
+; PPC-PWR8-NEXT: mr r5, r25
+; PPC-PWR8-NEXT: mr r6, r24
+; PPC-PWR8-NEXT: stw r8, 28(r1)
+; PPC-PWR8-NEXT: stw r7, 24(r1)
+; PPC-PWR8-NEXT: li r7, 5
+; PPC-PWR8-NEXT: li r8, 5
+; PPC-PWR8-NEXT: stw r4, 20(r1)
+; PPC-PWR8-NEXT: stw r3, 16(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: mr r4, r26
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: cmplwi r3, 0
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: beq cr0, .LBB5_1
+; PPC-PWR8-NEXT: # %bb.2: # %atomicrmw.end
+; PPC-PWR8-NEXT: lwz r30, 72(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r29, 68(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r28, 64(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r27, 60(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r26, 56(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r25, 52(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r24, 48(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r0, 84(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 80
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = atomicrmw xor i128* %a, i128 %x seq_cst, align 16
ret i128 %0
@@ -261,6 +848,109 @@ define i128 @nand(i128* %a, i128 %x) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: nand:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: sync
+; LE-PWR8-NEXT: .LBB6_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r6, 0, r3
+; LE-PWR8-NEXT: nand r9, r4, r7
+; LE-PWR8-NEXT: nand r8, r5, r6
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB6_1
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r7
+; LE-PWR8-NEXT: mr r4, r6
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: nand:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: sync
+; AIX64-PWR8-NEXT: bl .__sync_fetch_and_nand_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: nand:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -80(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 80
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: .cfi_offset r24, -32
+; PPC-PWR8-NEXT: .cfi_offset r25, -28
+; PPC-PWR8-NEXT: .cfi_offset r26, -24
+; PPC-PWR8-NEXT: .cfi_offset r27, -20
+; PPC-PWR8-NEXT: .cfi_offset r28, -16
+; PPC-PWR8-NEXT: .cfi_offset r29, -12
+; PPC-PWR8-NEXT: .cfi_offset r30, -8
+; PPC-PWR8-NEXT: stw r26, 56(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r27, 60(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r27, r5
+; PPC-PWR8-NEXT: mr r26, r3
+; PPC-PWR8-NEXT: lwz r5, 8(r3)
+; PPC-PWR8-NEXT: lwz r4, 4(r3)
+; PPC-PWR8-NEXT: stw r28, 64(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r28, r6
+; PPC-PWR8-NEXT: lwz r6, 12(r3)
+; PPC-PWR8-NEXT: lwz r3, 0(r3)
+; PPC-PWR8-NEXT: stw r24, 48(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: addi r24, r1, 16
+; PPC-PWR8-NEXT: stw r25, 52(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: stw r29, 68(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r29, r7
+; PPC-PWR8-NEXT: addi r25, r1, 32
+; PPC-PWR8-NEXT: stw r30, 72(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT: mr r30, r8
+; PPC-PWR8-NEXT: .p2align 4
+; PPC-PWR8-NEXT: .LBB6_1: # %atomicrmw.start
+; PPC-PWR8-NEXT: #
+; PPC-PWR8-NEXT: stw r3, 32(r1)
+; PPC-PWR8-NEXT: stw r4, 36(r1)
+; PPC-PWR8-NEXT: nand r7, r5, r29
+; PPC-PWR8-NEXT: nand r8, r6, r30
+; PPC-PWR8-NEXT: nand r3, r3, r27
+; PPC-PWR8-NEXT: nand r4, r4, r28
+; PPC-PWR8-NEXT: stw r5, 40(r1)
+; PPC-PWR8-NEXT: stw r6, 44(r1)
+; PPC-PWR8-NEXT: mr r5, r25
+; PPC-PWR8-NEXT: mr r6, r24
+; PPC-PWR8-NEXT: stw r8, 28(r1)
+; PPC-PWR8-NEXT: stw r7, 24(r1)
+; PPC-PWR8-NEXT: li r7, 5
+; PPC-PWR8-NEXT: li r8, 5
+; PPC-PWR8-NEXT: stw r4, 20(r1)
+; PPC-PWR8-NEXT: stw r3, 16(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: mr r4, r26
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: cmplwi r3, 0
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: beq cr0, .LBB6_1
+; PPC-PWR8-NEXT: # %bb.2: # %atomicrmw.end
+; PPC-PWR8-NEXT: lwz r30, 72(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r29, 68(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r28, 64(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r27, 60(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r26, 56(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r25, 52(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r24, 48(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT: lwz r0, 84(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 80
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = atomicrmw nand i128* %a, i128 %x seq_cst, align 16
ret i128 %0
@@ -306,6 +996,76 @@ define i128 @cas_weak_acquire_acquire(i128* %a, i128 %cmp, i128 %new) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: cas_weak_acquire_acquire:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: .LBB7_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r8, 0, r3
+; LE-PWR8-NEXT: xor r11, r9, r4
+; LE-PWR8-NEXT: xor r10, r8, r5
+; LE-PWR8-NEXT: or. r11, r11, r10
+; LE-PWR8-NEXT: bne cr0, .LBB7_3
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: mr r11, r6
+; LE-PWR8-NEXT: mr r10, r7
+; LE-PWR8-NEXT: stqcx. r10, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB7_1
+; LE-PWR8-NEXT: b .LBB7_4
+; LE-PWR8-NEXT: .LBB7_3: # %entry
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: .LBB7_4: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r9
+; LE-PWR8-NEXT: mr r4, r8
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: cas_weak_acquire_acquire:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: bl .__sync_val_compare_and_swap_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: cas_weak_acquire_acquire:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -48(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 48
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: mr r4, r3
+; PPC-PWR8-NEXT: lwz r3, 56(r1)
+; PPC-PWR8-NEXT: lwz r11, 60(r1)
+; PPC-PWR8-NEXT: stw r8, 44(r1)
+; PPC-PWR8-NEXT: stw r7, 40(r1)
+; PPC-PWR8-NEXT: li r7, 2
+; PPC-PWR8-NEXT: li r8, 2
+; PPC-PWR8-NEXT: stw r6, 36(r1)
+; PPC-PWR8-NEXT: stw r5, 32(r1)
+; PPC-PWR8-NEXT: addi r5, r1, 32
+; PPC-PWR8-NEXT: addi r6, r1, 16
+; PPC-PWR8-NEXT: stw r3, 24(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: stw r11, 28(r1)
+; PPC-PWR8-NEXT: stw r10, 20(r1)
+; PPC-PWR8-NEXT: stw r9, 16(r1)
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: lwz r0, 52(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 48
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = cmpxchg weak i128* %a, i128 %cmp, i128 %new acquire acquire
%1 = extractvalue { i128, i1 } %0, 0
@@ -351,6 +1111,76 @@ define i128 @cas_weak_release_monotonic(i128* %a, i128 %cmp, i128 %new) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: cas_weak_release_monotonic:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: .LBB8_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r8, 0, r3
+; LE-PWR8-NEXT: xor r11, r9, r4
+; LE-PWR8-NEXT: xor r10, r8, r5
+; LE-PWR8-NEXT: or. r11, r11, r10
+; LE-PWR8-NEXT: bne cr0, .LBB8_3
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: mr r11, r6
+; LE-PWR8-NEXT: mr r10, r7
+; LE-PWR8-NEXT: stqcx. r10, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB8_1
+; LE-PWR8-NEXT: b .LBB8_4
+; LE-PWR8-NEXT: .LBB8_3: # %entry
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: .LBB8_4: # %entry
+; LE-PWR8-NEXT: mr r3, r9
+; LE-PWR8-NEXT: mr r4, r8
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: cas_weak_release_monotonic:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: bl .__sync_val_compare_and_swap_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: cas_weak_release_monotonic:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -48(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 48
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: mr r4, r3
+; PPC-PWR8-NEXT: lwz r3, 56(r1)
+; PPC-PWR8-NEXT: lwz r11, 60(r1)
+; PPC-PWR8-NEXT: stw r8, 44(r1)
+; PPC-PWR8-NEXT: stw r7, 40(r1)
+; PPC-PWR8-NEXT: li r7, 3
+; PPC-PWR8-NEXT: li r8, 0
+; PPC-PWR8-NEXT: stw r6, 36(r1)
+; PPC-PWR8-NEXT: stw r5, 32(r1)
+; PPC-PWR8-NEXT: addi r5, r1, 32
+; PPC-PWR8-NEXT: addi r6, r1, 16
+; PPC-PWR8-NEXT: stw r3, 24(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: stw r11, 28(r1)
+; PPC-PWR8-NEXT: stw r10, 20(r1)
+; PPC-PWR8-NEXT: stw r9, 16(r1)
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: lwz r0, 52(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 48
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = cmpxchg weak i128* %a, i128 %cmp, i128 %new release monotonic
%1 = extractvalue { i128, i1 } %0, 0
@@ -398,6 +1228,78 @@ define i128 @cas_sc_sc(i128* %a, i128 %cmp, i128 %new) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: cas_sc_sc:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: sync
+; LE-PWR8-NEXT: .LBB9_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r8, 0, r3
+; LE-PWR8-NEXT: xor r11, r9, r4
+; LE-PWR8-NEXT: xor r10, r8, r5
+; LE-PWR8-NEXT: or. r11, r11, r10
+; LE-PWR8-NEXT: bne cr0, .LBB9_3
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: mr r11, r6
+; LE-PWR8-NEXT: mr r10, r7
+; LE-PWR8-NEXT: stqcx. r10, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB9_1
+; LE-PWR8-NEXT: b .LBB9_4
+; LE-PWR8-NEXT: .LBB9_3: # %entry
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: .LBB9_4: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r9
+; LE-PWR8-NEXT: mr r4, r8
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: cas_sc_sc:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: sync
+; AIX64-PWR8-NEXT: bl .__sync_val_compare_and_swap_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: cas_sc_sc:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -48(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 48
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: mr r4, r3
+; PPC-PWR8-NEXT: lwz r3, 56(r1)
+; PPC-PWR8-NEXT: lwz r11, 60(r1)
+; PPC-PWR8-NEXT: stw r8, 44(r1)
+; PPC-PWR8-NEXT: stw r7, 40(r1)
+; PPC-PWR8-NEXT: li r7, 5
+; PPC-PWR8-NEXT: li r8, 5
+; PPC-PWR8-NEXT: stw r6, 36(r1)
+; PPC-PWR8-NEXT: stw r5, 32(r1)
+; PPC-PWR8-NEXT: addi r5, r1, 32
+; PPC-PWR8-NEXT: addi r6, r1, 16
+; PPC-PWR8-NEXT: stw r3, 24(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: stw r11, 28(r1)
+; PPC-PWR8-NEXT: stw r10, 20(r1)
+; PPC-PWR8-NEXT: stw r9, 16(r1)
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: lwz r0, 52(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 48
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = cmpxchg i128* %a, i128 %cmp, i128 %new seq_cst seq_cst
%1 = extractvalue { i128, i1 } %0, 0
@@ -445,6 +1347,78 @@ define i128 @cas_acqrel_acquire(i128* %a, i128 %cmp, i128 %new) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: cas_acqrel_acquire:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: .LBB10_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r8, 0, r3
+; LE-PWR8-NEXT: xor r11, r9, r4
+; LE-PWR8-NEXT: xor r10, r8, r5
+; LE-PWR8-NEXT: or. r11, r11, r10
+; LE-PWR8-NEXT: bne cr0, .LBB10_3
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: mr r11, r6
+; LE-PWR8-NEXT: mr r10, r7
+; LE-PWR8-NEXT: stqcx. r10, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB10_1
+; LE-PWR8-NEXT: b .LBB10_4
+; LE-PWR8-NEXT: .LBB10_3: # %entry
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: .LBB10_4: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: mr r3, r9
+; LE-PWR8-NEXT: mr r4, r8
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: cas_acqrel_acquire:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -112(r1)
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: bl .__sync_val_compare_and_swap_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: addi r1, r1, 112
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: cas_acqrel_acquire:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -48(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 48
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: mr r4, r3
+; PPC-PWR8-NEXT: lwz r3, 56(r1)
+; PPC-PWR8-NEXT: lwz r11, 60(r1)
+; PPC-PWR8-NEXT: stw r8, 44(r1)
+; PPC-PWR8-NEXT: stw r7, 40(r1)
+; PPC-PWR8-NEXT: li r7, 4
+; PPC-PWR8-NEXT: li r8, 2
+; PPC-PWR8-NEXT: stw r6, 36(r1)
+; PPC-PWR8-NEXT: stw r5, 32(r1)
+; PPC-PWR8-NEXT: addi r5, r1, 32
+; PPC-PWR8-NEXT: addi r6, r1, 16
+; PPC-PWR8-NEXT: stw r3, 24(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: stw r11, 28(r1)
+; PPC-PWR8-NEXT: stw r10, 20(r1)
+; PPC-PWR8-NEXT: stw r9, 16(r1)
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: lwz r6, 44(r1)
+; PPC-PWR8-NEXT: lwz r5, 40(r1)
+; PPC-PWR8-NEXT: lwz r4, 36(r1)
+; PPC-PWR8-NEXT: lwz r3, 32(r1)
+; PPC-PWR8-NEXT: lwz r0, 52(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 48
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = cmpxchg i128* %a, i128 %cmp, i128 %new acq_rel acquire
%1 = extractvalue { i128, i1 } %0, 0
@@ -508,6 +1482,88 @@ define i1 @cas_acqrel_acquire_check_succ(i128* %a, i128 %cmp, i128 %new) {
; PWR7-NEXT: ld r0, 16(r1)
; PWR7-NEXT: mtlr r0
; PWR7-NEXT: blr
+;
+; LE-PWR8-LABEL: cas_acqrel_acquire_check_succ:
+; LE-PWR8: # %bb.0: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: .LBB11_1: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: lqarx r8, 0, r3
+; LE-PWR8-NEXT: xor r11, r9, r4
+; LE-PWR8-NEXT: xor r10, r8, r5
+; LE-PWR8-NEXT: or. r11, r11, r10
+; LE-PWR8-NEXT: bne cr0, .LBB11_3
+; LE-PWR8-NEXT: # %bb.2: # %entry
+; LE-PWR8-NEXT: #
+; LE-PWR8-NEXT: mr r11, r6
+; LE-PWR8-NEXT: mr r10, r7
+; LE-PWR8-NEXT: stqcx. r10, 0, r3
+; LE-PWR8-NEXT: bne cr0, .LBB11_1
+; LE-PWR8-NEXT: b .LBB11_4
+; LE-PWR8-NEXT: .LBB11_3: # %entry
+; LE-PWR8-NEXT: stqcx. r8, 0, r3
+; LE-PWR8-NEXT: .LBB11_4: # %entry
+; LE-PWR8-NEXT: lwsync
+; LE-PWR8-NEXT: xor r3, r5, r8
+; LE-PWR8-NEXT: xor r4, r4, r9
+; LE-PWR8-NEXT: or r3, r4, r3
+; LE-PWR8-NEXT: cntlzd r3, r3
+; LE-PWR8-NEXT: rldicl r3, r3, 58, 63
+; LE-PWR8-NEXT: blr
+;
+; AIX64-PWR8-LABEL: cas_acqrel_acquire_check_succ:
+; AIX64-PWR8: # %bb.0: # %entry
+; AIX64-PWR8-NEXT: mflr r0
+; AIX64-PWR8-NEXT: std r0, 16(r1)
+; AIX64-PWR8-NEXT: stdu r1, -128(r1)
+; AIX64-PWR8-NEXT: std r30, 112(r1) # 8-byte Folded Spill
+; AIX64-PWR8-NEXT: std r31, 120(r1) # 8-byte Folded Spill
+; AIX64-PWR8-NEXT: mr r31, r5
+; AIX64-PWR8-NEXT: mr r30, r4
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: bl .__sync_val_compare_and_swap_16[PR]
+; AIX64-PWR8-NEXT: nop
+; AIX64-PWR8-NEXT: xor r3, r3, r30
+; AIX64-PWR8-NEXT: xor r4, r4, r31
+; AIX64-PWR8-NEXT: lwsync
+; AIX64-PWR8-NEXT: or r3, r4, r3
+; AIX64-PWR8-NEXT: ld r31, 120(r1) # 8-byte Folded Reload
+; AIX64-PWR8-NEXT: ld r30, 112(r1) # 8-byte Folded Reload
+; AIX64-PWR8-NEXT: cntlzd r3, r3
+; AIX64-PWR8-NEXT: rldicl r3, r3, 58, 63
+; AIX64-PWR8-NEXT: addi r1, r1, 128
+; AIX64-PWR8-NEXT: ld r0, 16(r1)
+; AIX64-PWR8-NEXT: mtlr r0
+; AIX64-PWR8-NEXT: blr
+;
+; PPC-PWR8-LABEL: cas_acqrel_acquire_check_succ:
+; PPC-PWR8: # %bb.0: # %entry
+; PPC-PWR8-NEXT: mflr r0
+; PPC-PWR8-NEXT: stw r0, 4(r1)
+; PPC-PWR8-NEXT: stwu r1, -48(r1)
+; PPC-PWR8-NEXT: .cfi_def_cfa_offset 48
+; PPC-PWR8-NEXT: .cfi_offset lr, 4
+; PPC-PWR8-NEXT: mr r4, r3
+; PPC-PWR8-NEXT: lwz r3, 56(r1)
+; PPC-PWR8-NEXT: lwz r11, 60(r1)
+; PPC-PWR8-NEXT: stw r8, 44(r1)
+; PPC-PWR8-NEXT: stw r7, 40(r1)
+; PPC-PWR8-NEXT: li r7, 4
+; PPC-PWR8-NEXT: li r8, 2
+; PPC-PWR8-NEXT: stw r6, 36(r1)
+; PPC-PWR8-NEXT: stw r5, 32(r1)
+; PPC-PWR8-NEXT: addi r5, r1, 32
+; PPC-PWR8-NEXT: addi r6, r1, 16
+; PPC-PWR8-NEXT: stw r3, 24(r1)
+; PPC-PWR8-NEXT: li r3, 16
+; PPC-PWR8-NEXT: stw r11, 28(r1)
+; PPC-PWR8-NEXT: stw r10, 20(r1)
+; PPC-PWR8-NEXT: stw r9, 16(r1)
+; PPC-PWR8-NEXT: bl __atomic_compare_exchange
+; PPC-PWR8-NEXT: lwz r0, 52(r1)
+; PPC-PWR8-NEXT: addi r1, r1, 48
+; PPC-PWR8-NEXT: mtlr r0
+; PPC-PWR8-NEXT: blr
entry:
%0 = cmpxchg i128* %a, i128 %cmp, i128 %new acq_rel acquire
%1 = extractvalue { i128, i1 } %0, 1
More information about the llvm-commits
mailing list