[clang] 5fdd094 - [clang][CodeGen] Emit atomic IR in place of optimized libcalls. (#73176)

via cfe-commits cfe-commits at lists.llvm.org
Mon Feb 12 09:33:12 PST 2024


Author: Logikable
Date: 2024-02-12T09:33:09-08:00
New Revision: 5fdd094837c6d8437803ebf3ccc91c3d494a2ac8

URL: https://github.com/llvm/llvm-project/commit/5fdd094837c6d8437803ebf3ccc91c3d494a2ac8
DIFF: https://github.com/llvm/llvm-project/commit/5fdd094837c6d8437803ebf3ccc91c3d494a2ac8.diff

LOG: [clang][CodeGen] Emit atomic IR in place of optimized libcalls. (#73176)

In the beginning, Clang only emitted atomic IR for operations it knew the
underlying microarch had instructions for, meaning it required significant
knowledge of the target. Later, the backend acquired the ability to lower
IR to libcalls. To avoid duplicating logic and improve logic locality, we'd
like to move as much as possible to the backend.

In concrete terms, this change reduces the inputs Clang uses to decide
whether to emit libcalls or IR down to a single one: the atomic's size.
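
As an illustrative sketch (not the exact Clang code; the standalone helper
name is hypothetical), the frontend's decision after this change mirrors
the check added to CGAtomic.cpp below:

// Hypothetical standalone helper mirroring the new check in
// CodeGenFunction::EmitAtomicExpr: only the atomic's size matters.
#include <cstdint>

static bool shouldUseLibcall(uint64_t SizeInBytes) {
  // Power-of-2 sizes up to 16 bytes are emitted as atomic IR
  // (atomic load/store, atomicrmw, cmpxchg); everything else is
  // lowered to the generic __atomic_* libcalls by the frontend.
  bool PowerOf2Size = (SizeInBytes & (SizeInBytes - 1)) == 0;
  return !PowerOf2Size || SizeInBytes > 16;
}

Any remaining per-target libcall lowering (including the size-optimized
__atomic_*_N variants, which only exist up to 16 bytes) is left to the
backend.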

Added: 
    

Modified: 
    clang/lib/CodeGen/CGAtomic.cpp
    clang/test/CodeGen/LoongArch/atomics.c
    clang/test/CodeGen/PowerPC/quadword-atomics.c
    clang/test/CodeGen/RISCV/riscv-atomics.c
    clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c
    clang/test/CodeGen/arm-atomics-m.c
    clang/test/CodeGen/arm-atomics-m0.c
    clang/test/CodeGen/atomic-ops-libcall.c
    clang/test/CodeGen/atomic-ops.c
    clang/test/CodeGen/atomics-inlining.c
    clang/test/CodeGen/c11atomics.c
    clang/test/CodeGenCXX/atomic-inline.cpp
    clang/test/CodeGenOpenCL/atomic-ops-libcall.cl

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index 52e6ddb7d6afb0..a8d846b4f6a592 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -811,29 +811,6 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *Expr, Address Dest,
   Builder.SetInsertPoint(ContBB);
 }
 
-static void
-AddDirectArgument(CodeGenFunction &CGF, CallArgList &Args,
-                  bool UseOptimizedLibcall, llvm::Value *Val, QualType ValTy,
-                  SourceLocation Loc, CharUnits SizeInChars) {
-  if (UseOptimizedLibcall) {
-    // Load value and pass it to the function directly.
-    CharUnits Align = CGF.getContext().getTypeAlignInChars(ValTy);
-    int64_t SizeInBits = CGF.getContext().toBits(SizeInChars);
-    ValTy =
-        CGF.getContext().getIntTypeForBitwidth(SizeInBits, /*Signed=*/false);
-    llvm::Type *ITy = llvm::IntegerType::get(CGF.getLLVMContext(), SizeInBits);
-    Address Ptr = Address(Val, ITy, Align);
-    Val = CGF.EmitLoadOfScalar(Ptr, false,
-                               CGF.getContext().getPointerType(ValTy),
-                               Loc);
-    // Coerce the value into an appropriately sized integer type.
-    Args.add(RValue::get(Val), ValTy);
-  } else {
-    // Non-optimized functions always take a reference.
-    Args.add(RValue::get(Val), CGF.getContext().VoidPtrTy);
-  }
-}
-
 RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
   QualType AtomicTy = E->getPtr()->getType()->getPointeeType();
   QualType MemTy = AtomicTy;
@@ -857,22 +834,16 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
   uint64_t Size = TInfo.Width.getQuantity();
   unsigned MaxInlineWidthInBits = getTarget().getMaxAtomicInlineWidth();
 
-  bool Oversized = getContext().toBits(TInfo.Width) > MaxInlineWidthInBits;
-  bool Misaligned = (Ptr.getAlignment() % TInfo.Width) != 0;
-  bool UseLibcall = Misaligned | Oversized;
-  bool ShouldCastToIntPtrTy = true;
-
   CharUnits MaxInlineWidth =
       getContext().toCharUnitsFromBits(MaxInlineWidthInBits);
-
   DiagnosticsEngine &Diags = CGM.getDiags();
-
+  bool Misaligned = (Ptr.getAlignment() % TInfo.Width) != 0;
+  bool Oversized = getContext().toBits(TInfo.Width) > MaxInlineWidthInBits;
   if (Misaligned) {
     Diags.Report(E->getBeginLoc(), diag::warn_atomic_op_misaligned)
         << (int)TInfo.Width.getQuantity()
         << (int)Ptr.getAlignment().getQuantity();
   }
-
   if (Oversized) {
     Diags.Report(E->getBeginLoc(), diag::warn_atomic_op_oversized)
         << (int)TInfo.Width.getQuantity() << (int)MaxInlineWidth.getQuantity();
@@ -881,6 +852,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
   llvm::Value *Order = EmitScalarExpr(E->getOrder());
   llvm::Value *Scope =
       E->getScopeModel() ? EmitScalarExpr(E->getScope()) : nullptr;
+  bool ShouldCastToIntPtrTy = true;
 
   switch (E->getOp()) {
   case AtomicExpr::AO__c11_atomic_init:
@@ -1047,122 +1019,25 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
       Dest = Atomics.castToAtomicIntPointer(Dest);
   }
 
-  // Use a library call.  See: http://gcc.gnu.org/wiki/Atomic/GCCMM/LIbrary .
+  bool PowerOf2Size = (Size & (Size - 1)) == 0;
+  bool UseLibcall = !PowerOf2Size || (Size > 16);
+
+  // For atomics larger than 16 bytes, emit a libcall from the frontend. This
+  // avoids the overhead of dealing with excessively-large value types in IR.
+  // Non-power-of-2 values also lower to libcall here, as they are not currently
+  // permitted in IR instructions (although that constraint could be relaxed in
+  // the future). For other cases where a libcall is required on a given
+  // platform, we let the backend handle it (this includes handling for all of
+  // the size-optimized libcall variants, which are only valid up to 16 bytes.)
+  //
+  // See: https://llvm.org/docs/Atomics.html#libcalls-atomic
   if (UseLibcall) {
-    bool UseOptimizedLibcall = false;
-    switch (E->getOp()) {
-    case AtomicExpr::AO__c11_atomic_init:
-    case AtomicExpr::AO__opencl_atomic_init:
-      llvm_unreachable("Already handled above with EmitAtomicInit!");
-
-    case AtomicExpr::AO__atomic_fetch_add:
-    case AtomicExpr::AO__atomic_fetch_and:
-    case AtomicExpr::AO__atomic_fetch_max:
-    case AtomicExpr::AO__atomic_fetch_min:
-    case AtomicExpr::AO__atomic_fetch_nand:
-    case AtomicExpr::AO__atomic_fetch_or:
-    case AtomicExpr::AO__atomic_fetch_sub:
-    case AtomicExpr::AO__atomic_fetch_xor:
-    case AtomicExpr::AO__atomic_add_fetch:
-    case AtomicExpr::AO__atomic_and_fetch:
-    case AtomicExpr::AO__atomic_max_fetch:
-    case AtomicExpr::AO__atomic_min_fetch:
-    case AtomicExpr::AO__atomic_nand_fetch:
-    case AtomicExpr::AO__atomic_or_fetch:
-    case AtomicExpr::AO__atomic_sub_fetch:
-    case AtomicExpr::AO__atomic_xor_fetch:
-    case AtomicExpr::AO__c11_atomic_fetch_add:
-    case AtomicExpr::AO__c11_atomic_fetch_and:
-    case AtomicExpr::AO__c11_atomic_fetch_max:
-    case AtomicExpr::AO__c11_atomic_fetch_min:
-    case AtomicExpr::AO__c11_atomic_fetch_nand:
-    case AtomicExpr::AO__c11_atomic_fetch_or:
-    case AtomicExpr::AO__c11_atomic_fetch_sub:
-    case AtomicExpr::AO__c11_atomic_fetch_xor:
-    case AtomicExpr::AO__hip_atomic_fetch_add:
-    case AtomicExpr::AO__hip_atomic_fetch_and:
-    case AtomicExpr::AO__hip_atomic_fetch_max:
-    case AtomicExpr::AO__hip_atomic_fetch_min:
-    case AtomicExpr::AO__hip_atomic_fetch_or:
-    case AtomicExpr::AO__hip_atomic_fetch_sub:
-    case AtomicExpr::AO__hip_atomic_fetch_xor:
-    case AtomicExpr::AO__opencl_atomic_fetch_add:
-    case AtomicExpr::AO__opencl_atomic_fetch_and:
-    case AtomicExpr::AO__opencl_atomic_fetch_max:
-    case AtomicExpr::AO__opencl_atomic_fetch_min:
-    case AtomicExpr::AO__opencl_atomic_fetch_or:
-    case AtomicExpr::AO__opencl_atomic_fetch_sub:
-    case AtomicExpr::AO__opencl_atomic_fetch_xor:
-    case AtomicExpr::AO__scoped_atomic_fetch_add:
-    case AtomicExpr::AO__scoped_atomic_fetch_and:
-    case AtomicExpr::AO__scoped_atomic_fetch_max:
-    case AtomicExpr::AO__scoped_atomic_fetch_min:
-    case AtomicExpr::AO__scoped_atomic_fetch_nand:
-    case AtomicExpr::AO__scoped_atomic_fetch_or:
-    case AtomicExpr::AO__scoped_atomic_fetch_sub:
-    case AtomicExpr::AO__scoped_atomic_fetch_xor:
-    case AtomicExpr::AO__scoped_atomic_add_fetch:
-    case AtomicExpr::AO__scoped_atomic_and_fetch:
-    case AtomicExpr::AO__scoped_atomic_max_fetch:
-    case AtomicExpr::AO__scoped_atomic_min_fetch:
-    case AtomicExpr::AO__scoped_atomic_nand_fetch:
-    case AtomicExpr::AO__scoped_atomic_or_fetch:
-    case AtomicExpr::AO__scoped_atomic_sub_fetch:
-    case AtomicExpr::AO__scoped_atomic_xor_fetch:
-      // For these, only library calls for certain sizes exist.
-      UseOptimizedLibcall = true;
-      break;
-
-    case AtomicExpr::AO__atomic_load:
-    case AtomicExpr::AO__atomic_store:
-    case AtomicExpr::AO__atomic_exchange:
-    case AtomicExpr::AO__atomic_compare_exchange:
-    case AtomicExpr::AO__scoped_atomic_load:
-    case AtomicExpr::AO__scoped_atomic_store:
-    case AtomicExpr::AO__scoped_atomic_exchange:
-    case AtomicExpr::AO__scoped_atomic_compare_exchange:
-      // Use the generic version if we don't know that the operand will be
-      // suitably aligned for the optimized version.
-      if (Misaligned)
-        break;
-      [[fallthrough]];
-    case AtomicExpr::AO__atomic_load_n:
-    case AtomicExpr::AO__atomic_store_n:
-    case AtomicExpr::AO__atomic_exchange_n:
-    case AtomicExpr::AO__atomic_compare_exchange_n:
-    case AtomicExpr::AO__c11_atomic_load:
-    case AtomicExpr::AO__c11_atomic_store:
-    case AtomicExpr::AO__c11_atomic_exchange:
-    case AtomicExpr::AO__c11_atomic_compare_exchange_weak:
-    case AtomicExpr::AO__c11_atomic_compare_exchange_strong:
-    case AtomicExpr::AO__hip_atomic_load:
-    case AtomicExpr::AO__hip_atomic_store:
-    case AtomicExpr::AO__hip_atomic_exchange:
-    case AtomicExpr::AO__hip_atomic_compare_exchange_weak:
-    case AtomicExpr::AO__hip_atomic_compare_exchange_strong:
-    case AtomicExpr::AO__opencl_atomic_load:
-    case AtomicExpr::AO__opencl_atomic_store:
-    case AtomicExpr::AO__opencl_atomic_exchange:
-    case AtomicExpr::AO__opencl_atomic_compare_exchange_weak:
-    case AtomicExpr::AO__opencl_atomic_compare_exchange_strong:
-    case AtomicExpr::AO__scoped_atomic_load_n:
-    case AtomicExpr::AO__scoped_atomic_store_n:
-    case AtomicExpr::AO__scoped_atomic_exchange_n:
-    case AtomicExpr::AO__scoped_atomic_compare_exchange_n:
-      // Only use optimized library calls for sizes for which they exist.
-      // FIXME: Size == 16 optimized library functions exist too.
-      if (Size == 1 || Size == 2 || Size == 4 || Size == 8)
-        UseOptimizedLibcall = true;
-      break;
-    }
-
     CallArgList Args;
-    if (!UseOptimizedLibcall) {
-      // For non-optimized library calls, the size is the first parameter
-      Args.add(RValue::get(llvm::ConstantInt::get(SizeTy, Size)),
-               getContext().getSizeType());
-    }
-    // Atomic address is the first or second parameter
+    // For non-optimized library calls, the size is the first parameter.
+    Args.add(RValue::get(llvm::ConstantInt::get(SizeTy, Size)),
+             getContext().getSizeType());
+
+    // The atomic address is the second parameter.
     // The OpenCL atomic library functions only accept pointer arguments to
     // generic address space.
     auto CastToGenericAddrSpace = [&](llvm::Value *V, QualType PT) {
@@ -1177,18 +1052,14 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
       return getTargetHooks().performAddrSpaceCast(
           *this, V, AS, LangAS::opencl_generic, DestType, false);
     };
-
     Args.add(RValue::get(CastToGenericAddrSpace(Ptr.getPointer(),
                                                 E->getPtr()->getType())),
              getContext().VoidPtrTy);
 
+    // The next 1-3 parameters are op-dependent.
     std::string LibCallName;
-    QualType LoweredMemTy =
-      MemTy->isPointerType() ? getContext().getIntPtrType() : MemTy;
     QualType RetTy;
     bool HaveRetTy = false;
-    llvm::Instruction::BinaryOps PostOp = (llvm::Instruction::BinaryOps)0;
-    bool PostOpMinMax = false;
     switch (E->getOp()) {
     case AtomicExpr::AO__c11_atomic_init:
     case AtomicExpr::AO__opencl_atomic_init:
@@ -1199,8 +1070,6 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
     // and exchange.
     // bool __atomic_compare_exchange(size_t size, void *mem, void *expected,
     //                                void *desired, int success, int failure)
-    // bool __atomic_compare_exchange_N(T *mem, T *expected, T desired,
-    //                                  int success, int failure)
     case AtomicExpr::AO__atomic_compare_exchange:
     case AtomicExpr::AO__atomic_compare_exchange_n:
     case AtomicExpr::AO__c11_atomic_compare_exchange_weak:
@@ -1217,14 +1086,14 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
       Args.add(RValue::get(CastToGenericAddrSpace(Val1.getPointer(),
                                                   E->getVal1()->getType())),
                getContext().VoidPtrTy);
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val2.getPointer(),
-                        MemTy, E->getExprLoc(), TInfo.Width);
+      Args.add(RValue::get(CastToGenericAddrSpace(Val2.getPointer(),
+                                                  E->getVal2()->getType())),
+               getContext().VoidPtrTy);
       Args.add(RValue::get(Order), getContext().IntTy);
       Order = OrderFail;
       break;
     // void __atomic_exchange(size_t size, void *mem, void *val, void *return,
     //                        int order)
-    // T __atomic_exchange_N(T *mem, T val, int order)
     case AtomicExpr::AO__atomic_exchange:
     case AtomicExpr::AO__atomic_exchange_n:
     case AtomicExpr::AO__c11_atomic_exchange:
@@ -1233,11 +1102,11 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
     case AtomicExpr::AO__scoped_atomic_exchange:
     case AtomicExpr::AO__scoped_atomic_exchange_n:
       LibCallName = "__atomic_exchange";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        MemTy, E->getExprLoc(), TInfo.Width);
+      Args.add(RValue::get(CastToGenericAddrSpace(Val1.getPointer(),
+                                                  E->getVal1()->getType())),
+               getContext().VoidPtrTy);
       break;
     // void __atomic_store(size_t size, void *mem, void *val, int order)
-    // void __atomic_store_N(T *mem, T val, int order)
     case AtomicExpr::AO__atomic_store:
     case AtomicExpr::AO__atomic_store_n:
     case AtomicExpr::AO__c11_atomic_store:
@@ -1248,11 +1117,11 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
       LibCallName = "__atomic_store";
       RetTy = getContext().VoidTy;
       HaveRetTy = true;
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        MemTy, E->getExprLoc(), TInfo.Width);
+      Args.add(RValue::get(CastToGenericAddrSpace(Val1.getPointer(),
+                                                  E->getVal1()->getType())),
+               getContext().VoidPtrTy);
       break;
     // void __atomic_load(size_t size, void *mem, void *return, int order)
-    // T __atomic_load_N(T *mem, int order)
     case AtomicExpr::AO__atomic_load:
     case AtomicExpr::AO__atomic_load_n:
     case AtomicExpr::AO__c11_atomic_load:
@@ -1262,183 +1131,85 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) {
     case AtomicExpr::AO__scoped_atomic_load_n:
       LibCallName = "__atomic_load";
       break;
-    // T __atomic_add_fetch_N(T *mem, T val, int order)
-    // T __atomic_fetch_add_N(T *mem, T val, int order)
     case AtomicExpr::AO__atomic_add_fetch:
     case AtomicExpr::AO__scoped_atomic_add_fetch:
-      PostOp = llvm::Instruction::Add;
-      [[fallthrough]];
     case AtomicExpr::AO__atomic_fetch_add:
     case AtomicExpr::AO__c11_atomic_fetch_add:
     case AtomicExpr::AO__hip_atomic_fetch_add:
     case AtomicExpr::AO__opencl_atomic_fetch_add:
     case AtomicExpr::AO__scoped_atomic_fetch_add:
-      LibCallName = "__atomic_fetch_add";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        LoweredMemTy, E->getExprLoc(), TInfo.Width);
-      break;
-    // T __atomic_and_fetch_N(T *mem, T val, int order)
-    // T __atomic_fetch_and_N(T *mem, T val, int order)
     case AtomicExpr::AO__atomic_and_fetch:
     case AtomicExpr::AO__scoped_atomic_and_fetch:
-      PostOp = llvm::Instruction::And;
-      [[fallthrough]];
     case AtomicExpr::AO__atomic_fetch_and:
     case AtomicExpr::AO__c11_atomic_fetch_and:
     case AtomicExpr::AO__hip_atomic_fetch_and:
     case AtomicExpr::AO__opencl_atomic_fetch_and:
     case AtomicExpr::AO__scoped_atomic_fetch_and:
-      LibCallName = "__atomic_fetch_and";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        MemTy, E->getExprLoc(), TInfo.Width);
-      break;
-    // T __atomic_or_fetch_N(T *mem, T val, int order)
-    // T __atomic_fetch_or_N(T *mem, T val, int order)
     case AtomicExpr::AO__atomic_or_fetch:
     case AtomicExpr::AO__scoped_atomic_or_fetch:
-      PostOp = llvm::Instruction::Or;
-      [[fallthrough]];
     case AtomicExpr::AO__atomic_fetch_or:
     case AtomicExpr::AO__c11_atomic_fetch_or:
     case AtomicExpr::AO__hip_atomic_fetch_or:
     case AtomicExpr::AO__opencl_atomic_fetch_or:
     case AtomicExpr::AO__scoped_atomic_fetch_or:
-      LibCallName = "__atomic_fetch_or";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        MemTy, E->getExprLoc(), TInfo.Width);
-      break;
-    // T __atomic_sub_fetch_N(T *mem, T val, int order)
-    // T __atomic_fetch_sub_N(T *mem, T val, int order)
     case AtomicExpr::AO__atomic_sub_fetch:
     case AtomicExpr::AO__scoped_atomic_sub_fetch:
-      PostOp = llvm::Instruction::Sub;
-      [[fallthrough]];
     case AtomicExpr::AO__atomic_fetch_sub:
     case AtomicExpr::AO__c11_atomic_fetch_sub:
     case AtomicExpr::AO__hip_atomic_fetch_sub:
     case AtomicExpr::AO__opencl_atomic_fetch_sub:
     case AtomicExpr::AO__scoped_atomic_fetch_sub:
-      LibCallName = "__atomic_fetch_sub";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        LoweredMemTy, E->getExprLoc(), TInfo.Width);
-      break;
-    // T __atomic_xor_fetch_N(T *mem, T val, int order)
-    // T __atomic_fetch_xor_N(T *mem, T val, int order)
     case AtomicExpr::AO__atomic_xor_fetch:
     case AtomicExpr::AO__scoped_atomic_xor_fetch:
-      PostOp = llvm::Instruction::Xor;
-      [[fallthrough]];
     case AtomicExpr::AO__atomic_fetch_xor:
     case AtomicExpr::AO__c11_atomic_fetch_xor:
     case AtomicExpr::AO__hip_atomic_fetch_xor:
     case AtomicExpr::AO__opencl_atomic_fetch_xor:
     case AtomicExpr::AO__scoped_atomic_fetch_xor:
-      LibCallName = "__atomic_fetch_xor";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        MemTy, E->getExprLoc(), TInfo.Width);
-      break;
+    case AtomicExpr::AO__atomic_nand_fetch:
+    case AtomicExpr::AO__atomic_fetch_nand:
+    case AtomicExpr::AO__c11_atomic_fetch_nand:
+    case AtomicExpr::AO__scoped_atomic_fetch_nand:
+    case AtomicExpr::AO__scoped_atomic_nand_fetch:
     case AtomicExpr::AO__atomic_min_fetch:
-    case AtomicExpr::AO__scoped_atomic_min_fetch:
-      PostOpMinMax = true;
-      [[fallthrough]];
     case AtomicExpr::AO__atomic_fetch_min:
     case AtomicExpr::AO__c11_atomic_fetch_min:
-    case AtomicExpr::AO__scoped_atomic_fetch_min:
     case AtomicExpr::AO__hip_atomic_fetch_min:
     case AtomicExpr::AO__opencl_atomic_fetch_min:
-      LibCallName = E->getValueType()->isSignedIntegerType()
-                        ? "__atomic_fetch_min"
-                        : "__atomic_fetch_umin";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        LoweredMemTy, E->getExprLoc(), TInfo.Width);
-      break;
+    case AtomicExpr::AO__scoped_atomic_fetch_min:
+    case AtomicExpr::AO__scoped_atomic_min_fetch:
     case AtomicExpr::AO__atomic_max_fetch:
-    case AtomicExpr::AO__scoped_atomic_max_fetch:
-      PostOpMinMax = true;
-      [[fallthrough]];
     case AtomicExpr::AO__atomic_fetch_max:
     case AtomicExpr::AO__c11_atomic_fetch_max:
     case AtomicExpr::AO__hip_atomic_fetch_max:
     case AtomicExpr::AO__opencl_atomic_fetch_max:
     case AtomicExpr::AO__scoped_atomic_fetch_max:
-      LibCallName = E->getValueType()->isSignedIntegerType()
-                        ? "__atomic_fetch_max"
-                        : "__atomic_fetch_umax";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        LoweredMemTy, E->getExprLoc(), TInfo.Width);
-      break;
-    // T __atomic_nand_fetch_N(T *mem, T val, int order)
-    // T __atomic_fetch_nand_N(T *mem, T val, int order)
-    case AtomicExpr::AO__atomic_nand_fetch:
-    case AtomicExpr::AO__scoped_atomic_nand_fetch:
-      PostOp = llvm::Instruction::And; // the NOT is special cased below
-      [[fallthrough]];
-    case AtomicExpr::AO__atomic_fetch_nand:
-    case AtomicExpr::AO__c11_atomic_fetch_nand:
-    case AtomicExpr::AO__scoped_atomic_fetch_nand:
-      LibCallName = "__atomic_fetch_nand";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(),
-                        MemTy, E->getExprLoc(), TInfo.Width);
-      break;
+    case AtomicExpr::AO__scoped_atomic_max_fetch:
+      llvm_unreachable("Integral atomic operations always become atomicrmw!");
     }
 
     if (E->isOpenCL()) {
-      LibCallName = std::string("__opencl") +
-          StringRef(LibCallName).drop_front(1).str();
-
+      LibCallName =
+          std::string("__opencl") + StringRef(LibCallName).drop_front(1).str();
     }
-    // Optimized functions have the size in their name.
-    if (UseOptimizedLibcall)
-      LibCallName += "_" + llvm::utostr(Size);
     // By default, assume we return a value of the atomic type.
     if (!HaveRetTy) {
-      if (UseOptimizedLibcall) {
-        // Value is returned directly.
-        // The function returns an appropriately sized integer type.
-        RetTy = getContext().getIntTypeForBitwidth(
-            getContext().toBits(TInfo.Width), /*Signed=*/false);
-      } else {
-        // Value is returned through parameter before the order.
-        RetTy = getContext().VoidTy;
-        Args.add(RValue::get(Dest.getPointer()), getContext().VoidPtrTy);
-      }
+      // Value is returned through parameter before the order.
+      RetTy = getContext().VoidTy;
+      Args.add(RValue::get(CastToGenericAddrSpace(Dest.getPointer(), RetTy)),
+               getContext().VoidPtrTy);
     }
-    // order is always the last parameter
+    // Order is always the last parameter.
     Args.add(RValue::get(Order),
              getContext().IntTy);
     if (E->isOpenCL())
       Args.add(RValue::get(Scope), getContext().IntTy);
 
-    // PostOp is only needed for the atomic_*_fetch operations, and
-    // thus is only needed for and implemented in the
-    // UseOptimizedLibcall codepath.
-    assert(UseOptimizedLibcall || (!PostOp && !PostOpMinMax));
-
     RValue Res = emitAtomicLibcall(*this, LibCallName, RetTy, Args);
     // The value is returned directly from the libcall.
     if (E->isCmpXChg())
       return Res;
 
-    // The value is returned directly for optimized libcalls but the expr
-    // provided an out-param.
-    if (UseOptimizedLibcall && Res.getScalarVal()) {
-      llvm::Value *ResVal = Res.getScalarVal();
-      if (PostOpMinMax) {
-        llvm::Value *LoadVal1 = Args[1].getRValue(*this).getScalarVal();
-        ResVal = EmitPostAtomicMinMax(Builder, E->getOp(),
-                                      E->getValueType()->isSignedIntegerType(),
-                                      ResVal, LoadVal1);
-      } else if (PostOp) {
-        llvm::Value *LoadVal1 = Args[1].getRValue(*this).getScalarVal();
-        ResVal = Builder.CreateBinOp(PostOp, ResVal, LoadVal1);
-      }
-      if (E->getOp() == AtomicExpr::AO__atomic_nand_fetch ||
-          E->getOp() == AtomicExpr::AO__scoped_atomic_nand_fetch)
-        ResVal = Builder.CreateNot(ResVal);
-
-      Builder.CreateStore(ResVal, Dest.withElementType(ResVal->getType()));
-    }
-
     if (RValTy->isVoidType())
       return RValue::get(nullptr);
 

diff  --git a/clang/test/CodeGen/LoongArch/atomics.c b/clang/test/CodeGen/LoongArch/atomics.c
index edc58d30db186d..bd51fea661be1f 100644
--- a/clang/test/CodeGen/LoongArch/atomics.c
+++ b/clang/test/CodeGen/LoongArch/atomics.c
@@ -11,10 +11,10 @@
 void test_i8_atomics(_Atomic(int8_t) * a, int8_t b) {
   // LA32: load atomic i8, ptr %a seq_cst, align 1
   // LA32: store atomic i8 %b, ptr %a seq_cst, align 1
-  // LA32: atomicrmw add ptr %a, i8 %b seq_cst
+  // LA32: atomicrmw add ptr %a, i8 %b seq_cst, align 1
   // LA64: load atomic i8, ptr %a seq_cst, align 1
   // LA64: store atomic i8 %b, ptr %a seq_cst, align 1
-  // LA64: atomicrmw add ptr %a, i8 %b seq_cst
+  // LA64: atomicrmw add ptr %a, i8 %b seq_cst, align 1
   __c11_atomic_load(a, memory_order_seq_cst);
   __c11_atomic_store(a, b, memory_order_seq_cst);
   __c11_atomic_fetch_add(a, b, memory_order_seq_cst);
@@ -23,22 +23,22 @@ void test_i8_atomics(_Atomic(int8_t) * a, int8_t b) {
 void test_i32_atomics(_Atomic(int32_t) * a, int32_t b) {
   // LA32: load atomic i32, ptr %a seq_cst, align 4
   // LA32: store atomic i32 %b, ptr %a seq_cst, align 4
-  // LA32: atomicrmw add ptr %a, i32 %b seq_cst
+  // LA32: atomicrmw add ptr %a, i32 %b seq_cst, align 4
   // LA64: load atomic i32, ptr %a seq_cst, align 4
   // LA64: store atomic i32 %b, ptr %a seq_cst, align 4
-  // LA64: atomicrmw add ptr %a, i32 %b seq_cst
+  // LA64: atomicrmw add ptr %a, i32 %b seq_cst, align 4
   __c11_atomic_load(a, memory_order_seq_cst);
   __c11_atomic_store(a, b, memory_order_seq_cst);
   __c11_atomic_fetch_add(a, b, memory_order_seq_cst);
 }
 
 void test_i64_atomics(_Atomic(int64_t) * a, int64_t b) {
-  // LA32: call i64 @__atomic_load_8
-  // LA32: call void @__atomic_store_8
-  // LA32: call i64 @__atomic_fetch_add_8
+  // LA32: load atomic i64, ptr %a seq_cst, align 8
+  // LA32: store atomic i64 %b, ptr %a seq_cst, align 8
+  // LA32: atomicrmw add ptr %a, i64 %b seq_cst, align 8
   // LA64: load atomic i64, ptr %a seq_cst, align 8
   // LA64: store atomic i64 %b, ptr %a seq_cst, align 8
-  // LA64: atomicrmw add ptr %a, i64 %b seq_cst
+  // LA64: atomicrmw add ptr %a, i64 %b seq_cst, align 8
   __c11_atomic_load(a, memory_order_seq_cst);
   __c11_atomic_store(a, b, memory_order_seq_cst);
   __c11_atomic_fetch_add(a, b, memory_order_seq_cst);

diff  --git a/clang/test/CodeGen/PowerPC/quadword-atomics.c b/clang/test/CodeGen/PowerPC/quadword-atomics.c
index bff03b25d27ee9..dc04423060a03b 100644
--- a/clang/test/CodeGen/PowerPC/quadword-atomics.c
+++ b/clang/test/CodeGen/PowerPC/quadword-atomics.c
@@ -1,14 +1,18 @@
 // RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64le-linux-gnu \
-// RUN:   -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64-QUADWORD-ATOMICS
+// RUN:   -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s \
+// RUN:   --check-prefixes=PPC64,PPC64-QUADWORD-ATOMICS
 // RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64le-linux-gnu \
-// RUN:   -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64
+// RUN:   -emit-llvm -o - %s | FileCheck %s \
+// RUN:   --check-prefixes=PPC64,PPC64-NO-QUADWORD-ATOMICS
 // RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64-unknown-aix \
-// RUN:   -target-cpu pwr7 -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64
+// RUN:   -target-cpu pwr7 -emit-llvm -o - %s | FileCheck %s \
+// RUN:   --check-prefixes=PPC64,PPC64-NO-QUADWORD-ATOMICS
 // RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64-unknown-aix \
-// RUN:   -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC64
+// RUN:   -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s \
+// RUN:   --check-prefixes=PPC64,PPC64-NO-QUADWORD-ATOMICS
 // RUN: %clang_cc1 -Werror -Wno-atomic-alignment -triple powerpc64-unknown-aix \
-// RUN:   -mabi=quadword-atomics -target-cpu pwr8 -emit-llvm -o - %s | FileCheck %s \
-// RUN:   --check-prefix=PPC64-QUADWORD-ATOMICS
+// RUN:   -mabi=quadword-atomics -target-cpu pwr8 -emit-llvm -o - %s | \
+// RUN:   FileCheck %s --check-prefixes=PPC64,PPC64-QUADWORD-ATOMICS
 
 
 typedef struct {
@@ -19,66 +23,48 @@ typedef _Atomic(Q) AtomicQ;
 
 typedef __int128_t int128_t;
 
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_load(
-// PPC64-QUADWORD-ATOMICS:    [[TMP3:%.*]] = load atomic i128, ptr [[TMP1:%.*]] acquire, align 16
-//
 // PPC64-LABEL: @test_load(
-// PPC64:    call void @__atomic_load(i64 noundef 16, ptr noundef [[TMP3:%.*]], ptr noundef [[TMP4:%.*]], i32 noundef signext 2)
+// PPC64:    [[TMP3:%.*]] = load atomic i128, ptr [[TMP1:%.*]] acquire, align 16
 //
 Q test_load(AtomicQ *ptr) {
   // expected-no-diagnostics
   return __c11_atomic_load(ptr, __ATOMIC_ACQUIRE);
 }
 
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_store(
-// PPC64-QUADWORD-ATOMICS:    store atomic i128 [[TMP6:%.*]], ptr [[TMP4:%.*]] release, align 16
-//
 // PPC64-LABEL: @test_store(
-// PPC64:    call void @__atomic_store(i64 noundef 16, ptr noundef [[TMP6:%.*]], ptr noundef [[TMP7:%.*]], i32 noundef signext 3)
+// PPC64:    store atomic i128 [[TMP6:%.*]], ptr [[TMP4:%.*]] release, align 16
 //
 void test_store(Q val, AtomicQ *ptr) {
   // expected-no-diagnostics
   __c11_atomic_store(ptr, val, __ATOMIC_RELEASE);
 }
 
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_add(
-// PPC64-QUADWORD-ATOMICS:    [[TMP3:%.*]] = atomicrmw add ptr [[TMP0:%.*]], i128 [[TMP2:%.*]] monotonic, align 16
-//
 // PPC64-LABEL: @test_add(
-// PPC64:    [[CALL:%.*]] = call i128 @__atomic_fetch_add_16(ptr noundef [[TMP2:%.*]], i128 noundef [[TMP3:%.*]], i32 noundef signext 0)
+// PPC64:    [[ATOMICRMW:%.*]] = atomicrmw add ptr [[TMP0:%.*]], i128 [[TMP2:%.*]] monotonic, align 16
 //
 void test_add(_Atomic(int128_t) *ptr, int128_t x) {
   // expected-no-diagnostics
   __c11_atomic_fetch_add(ptr, x, __ATOMIC_RELAXED);
 }
 
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_xchg(
-// PPC64-QUADWORD-ATOMICS:    [[TMP8:%.*]] = atomicrmw xchg ptr [[TMP4:%.*]], i128 [[TMP7:%.*]] seq_cst, align 16
-//
 // PPC64-LABEL: @test_xchg(
-// PPC64:    call void @__atomic_exchange(i64 noundef 16, ptr noundef [[TMP7:%.*]], ptr noundef [[TMP8:%.*]], ptr noundef [[TMP9:%.*]], i32 noundef signext 5)
+// PPC64:    [[TMP8:%.*]] = atomicrmw xchg ptr [[TMP4:%.*]], i128 [[TMP7:%.*]] seq_cst, align 16
 //
 Q test_xchg(AtomicQ *ptr, Q new) {
   // expected-no-diagnostics
   return __c11_atomic_exchange(ptr, new, __ATOMIC_SEQ_CST);
 }
 
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_cmpxchg(
-// PPC64-QUADWORD-ATOMICS:    [[TMP10:%.*]] = cmpxchg ptr [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
-//
 // PPC64-LABEL: @test_cmpxchg(
-// PPC64:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, ptr noundef [[TMP8:%.*]], ptr noundef [[TMP9:%.*]], ptr noundef [[TMP10:%.*]], i32 noundef signext 5, i32 noundef signext 0)
+// PPC64:    [[TMP10:%.*]] = cmpxchg ptr [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
 //
 int test_cmpxchg(AtomicQ *ptr, Q *cmp, Q new) {
   // expected-no-diagnostics
   return __c11_atomic_compare_exchange_strong(ptr, cmp, new, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
 }
 
-// PPC64-QUADWORD-ATOMICS-LABEL: @test_cmpxchg_weak(
-// PPC64-QUADWORD-ATOMICS:    [[TMP10:%.*]] = cmpxchg weak ptr [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
-//
 // PPC64-LABEL: @test_cmpxchg_weak(
-// PPC64:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, ptr noundef [[TMP8:%.*]], ptr noundef [[TMP9:%.*]], ptr noundef [[TMP10:%.*]], i32 noundef signext 5, i32 noundef signext 0)
+// PPC64:    [[TMP10:%.*]] = cmpxchg weak ptr [[TMP5:%.*]], i128 [[TMP8:%.*]], i128 [[TMP9:%.*]] seq_cst monotonic, align 16
 //
 int test_cmpxchg_weak(AtomicQ *ptr, Q *cmp, Q new) {
   // expected-no-diagnostics
@@ -88,8 +74,8 @@ int test_cmpxchg_weak(AtomicQ *ptr, Q *cmp, Q new) {
 // PPC64-QUADWORD-ATOMICS-LABEL: @is_lock_free(
 // PPC64-QUADWORD-ATOMICS:    ret i32 1
 //
-// PPC64-LABEL: @is_lock_free(
-// PPC64:    [[CALL:%.*]] = call zeroext i1 @__atomic_is_lock_free(i64 noundef 16, ptr noundef null)
+// PPC64-NO-QUADWORD-ATOMICS-LABEL: @is_lock_free(
+// PPC64-NO-QUADWORD-ATOMICS:    [[CALL:%.*]] = call zeroext i1 @__atomic_is_lock_free(i64 noundef 16, ptr noundef null)
 //
 int is_lock_free() {
   AtomicQ q;

diff  --git a/clang/test/CodeGen/RISCV/riscv-atomics.c b/clang/test/CodeGen/RISCV/riscv-atomics.c
index f629ad7d72ea82..437cb949bbb0fe 100644
--- a/clang/test/CodeGen/RISCV/riscv-atomics.c
+++ b/clang/test/CodeGen/RISCV/riscv-atomics.c
@@ -1,68 +1,34 @@
 // RUN: %clang_cc1 -triple riscv32 -O1 -emit-llvm %s -o - \
-// RUN:   | FileCheck %s -check-prefix=RV32I
+// RUN:   -verify=no-atomics
 // RUN: %clang_cc1 -triple riscv32 -target-feature +a -O1 -emit-llvm %s -o - \
-// RUN:   | FileCheck %s -check-prefix=RV32IA
+// RUN:   -verify=small-atomics
 // RUN: %clang_cc1 -triple riscv64 -O1 -emit-llvm %s -o - \
-// RUN:   | FileCheck %s -check-prefix=RV64I
+// RUN:   -verify=no-atomics
 // RUN: %clang_cc1 -triple riscv64 -target-feature +a -O1 -emit-llvm %s -o - \
-// RUN:   | FileCheck %s -check-prefix=RV64IA
+// RUN:   -verify=all-atomics
 
-// This test demonstrates that MaxAtomicInlineWidth is set appropriately when
-// the atomics instruction set extension is enabled.
+// all-atomics-no-diagnostics
 
 #include <stdatomic.h>
 #include <stdint.h>
 
 void test_i8_atomics(_Atomic(int8_t) * a, int8_t b) {
-  // RV32I:  call zeroext i8 @__atomic_load_1
-  // RV32I:  call void @__atomic_store_1
-  // RV32I:  call zeroext i8 @__atomic_fetch_add_1
-  // RV32IA: load atomic i8, ptr %a seq_cst, align 1
-  // RV32IA: store atomic i8 %b, ptr %a seq_cst, align 1
-  // RV32IA: atomicrmw add ptr %a, i8 %b seq_cst, align 1
-  // RV64I:  call zeroext i8 @__atomic_load_1
-  // RV64I:  call void @__atomic_store_1
-  // RV64I:  call zeroext i8 @__atomic_fetch_add_1
-  // RV64IA: load atomic i8, ptr %a seq_cst, align 1
-  // RV64IA: store atomic i8 %b, ptr %a seq_cst, align 1
-  // RV64IA: atomicrmw add ptr %a, i8 %b seq_cst, align 1
-  __c11_atomic_load(a, memory_order_seq_cst);
-  __c11_atomic_store(a, b, memory_order_seq_cst);
-  __c11_atomic_fetch_add(a, b, memory_order_seq_cst);
+  __c11_atomic_load(a, memory_order_seq_cst);         // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (1 bytes) exceeds the max lock-free size (0  bytes)}}
+  __c11_atomic_store(a, b, memory_order_seq_cst);     // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (1 bytes) exceeds the max lock-free size (0  bytes)}}
+  __c11_atomic_fetch_add(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (1 bytes) exceeds the max lock-free size (0  bytes)}}
 }
 
 void test_i32_atomics(_Atomic(int32_t) * a, int32_t b) {
-  // RV32I:  call i32 @__atomic_load_4
-  // RV32I:  call void @__atomic_store_4
-  // RV32I:  call i32 @__atomic_fetch_add_4
-  // RV32IA: load atomic i32, ptr %a seq_cst, align 4
-  // RV32IA: store atomic i32 %b, ptr %a seq_cst, align 4
-  // RV32IA: atomicrmw add ptr %a, i32 %b seq_cst, align 4
-  // RV64I:  call signext i32 @__atomic_load_4
-  // RV64I:  call void @__atomic_store_4
-  // RV64I:  call signext i32 @__atomic_fetch_add_4
-  // RV64IA: load atomic i32, ptr %a seq_cst, align 4
-  // RV64IA: store atomic i32 %b, ptr %a seq_cst, align 4
-  // RV64IA: atomicrmw add ptr %a, i32 %b seq_cst, align 4
-  __c11_atomic_load(a, memory_order_seq_cst);
-  __c11_atomic_store(a, b, memory_order_seq_cst);
-  __c11_atomic_fetch_add(a, b, memory_order_seq_cst);
+  __c11_atomic_load(a, memory_order_seq_cst);         // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (4 bytes) exceeds the max lock-free size (0  bytes)}}
+  __c11_atomic_store(a, b, memory_order_seq_cst);     // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (4 bytes) exceeds the max lock-free size (0  bytes)}}
+  __c11_atomic_fetch_add(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (4 bytes) exceeds the max lock-free size (0  bytes)}}
 }
 
 void test_i64_atomics(_Atomic(int64_t) * a, int64_t b) {
-  // RV32I:  call i64 @__atomic_load_8
-  // RV32I:  call void @__atomic_store_8
-  // RV32I:  call i64 @__atomic_fetch_add_8
-  // RV32IA: call i64 @__atomic_load_8
-  // RV32IA: call void @__atomic_store_8
-  // RV32IA: call i64 @__atomic_fetch_add_8
-  // RV64I:  call i64 @__atomic_load_8
-  // RV64I:  call void @__atomic_store_8
-  // RV64I:  call i64 @__atomic_fetch_add_8
-  // RV64IA: load atomic i64, ptr %a seq_cst, align 8
-  // RV64IA: store atomic i64 %b, ptr %a seq_cst, align 8
-  // RV64IA: atomicrmw add ptr %a, i64 %b seq_cst, align 8
-  __c11_atomic_load(a, memory_order_seq_cst);
-  __c11_atomic_store(a, b, memory_order_seq_cst);
-  __c11_atomic_fetch_add(a, b, memory_order_seq_cst);
+  __c11_atomic_load(a, memory_order_seq_cst);         // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (0  bytes)}}
+                                                      // small-atomics-warning at 28 {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (4  bytes)}}
+  __c11_atomic_store(a, b, memory_order_seq_cst);     // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (0  bytes)}}
+                                                      // small-atomics-warning at 30 {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (4  bytes)}}
+  __c11_atomic_fetch_add(a, b, memory_order_seq_cst); // no-atomics-warning {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (0  bytes)}}
+                                                      // small-atomics-warning at 32 {{large atomic operation may incur significant performance penalty; the access size (8 bytes) exceeds the max lock-free size (4  bytes)}}
 }

diff  --git a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c
index 4f6dcbc2c01ec8..8759df7b19c638 100644
--- a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c
+++ b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c
@@ -20,7 +20,8 @@ __int128 Des;
 
 // CHECK-LABEL: @f1(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @__atomic_load(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull [[AGG_RESULT:%.*]], i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 __int128 f1() {
@@ -29,8 +30,8 @@ __int128 f1() {
 
 // CHECK-LABEL: @f2(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @__atomic_load(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Ret, i32 noundef signext 5)
-// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Ret, align 8, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP0]], ptr @Ret, align 8
 // CHECK-NEXT:    store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
@@ -41,10 +42,8 @@ __int128 f2() {
 
 // CHECK-LABEL: @f3(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[DOTATOMICTMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_store(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull [[DOTATOMICTMP]], i32 noundef signext 5)
+// CHECK-NEXT:    store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 8
 // CHECK-NEXT:    ret void
 //
 void f3() {
@@ -53,7 +52,8 @@ void f3() {
 
 // CHECK-LABEL: @f4(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @__atomic_store(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Val, i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8
+// CHECK-NEXT:    store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 8
 // CHECK-NEXT:    ret void
 //
 void f4() {
@@ -62,10 +62,9 @@ void f4() {
 
 // CHECK-LABEL: @f5(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[DOTATOMICTMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_exchange(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull [[DOTATOMICTMP]], ptr noundef nonnull [[AGG_RESULT:%.*]], i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 __int128 f5() {
@@ -74,9 +73,10 @@ __int128 f5() {
 
 // CHECK-LABEL: @f6(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @__atomic_exchange(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Val, ptr noundef nonnull @Ret, i32 noundef signext 5)
-// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Ret, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr @Ret, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 __int128 f6() {
@@ -86,11 +86,17 @@ __int128 f6() {
 
 // CHECK-LABEL: @f7(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Des, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[DOTATOMICTMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Exp, ptr noundef nonnull [[DOTATOMICTMP]], i32 noundef signext 5, i32 noundef signext 5)
-// CHECK-NEXT:    ret i1 [[CALL]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr @Exp, align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP1]], i128 [[TMP0]] seq_cst seq_cst, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1
+// CHECK-NEXT:    br i1 [[TMP3]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]]
+// CHECK:       cmpxchg.store_expected:
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0
+// CHECK-NEXT:    store i128 [[TMP4]], ptr @Exp, align 8
+// CHECK-NEXT:    br label [[CMPXCHG_CONTINUE]]
+// CHECK:       cmpxchg.continue:
+// CHECK-NEXT:    ret i1 [[TMP3]]
 //
 _Bool f7() {
   return __atomic_compare_exchange_n(&Ptr, &Exp, Des, 0,
@@ -99,8 +105,17 @@ _Bool f7() {
 
 // CHECK-LABEL: @f8(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call zeroext i1 @__atomic_compare_exchange(i64 noundef 16, ptr noundef nonnull @Ptr, ptr noundef nonnull @Exp, ptr noundef nonnull @Des, i32 noundef signext 5, i32 noundef signext 5)
-// CHECK-NEXT:    ret i1 [[CALL]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Exp, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr @Des, align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1
+// CHECK-NEXT:    br i1 [[TMP3]], label [[CMPXCHG_CONTINUE:%.*]], label [[CMPXCHG_STORE_EXPECTED:%.*]]
+// CHECK:       cmpxchg.store_expected:
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0
+// CHECK-NEXT:    store i128 [[TMP4]], ptr @Exp, align 8
+// CHECK-NEXT:    br label [[CMPXCHG_CONTINUE]]
+// CHECK:       cmpxchg.continue:
+// CHECK-NEXT:    ret i1 [[TMP3]]
 //
 _Bool f8() {
   return __atomic_compare_exchange(&Ptr, &Exp, &Des, 0,
@@ -109,12 +124,8 @@ _Bool f8() {
 
 // CHECK-LABEL: @f9(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_add_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]]
 // CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
@@ -125,12 +136,8 @@ __int128 f9() {
 
 // CHECK-LABEL: @f10(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_sub_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]]
 // CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
@@ -141,12 +148,8 @@ __int128 f10() {
 
 // CHECK-LABEL: @f11(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_and_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]]
 // CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
@@ -157,12 +160,8 @@ __int128 f11() {
 
 // CHECK-LABEL: @f12(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_xor_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]]
 // CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
@@ -173,12 +172,8 @@ __int128 f12() {
 
 // CHECK-LABEL: @f13(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_or_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]]
 // CHECK-NEXT:    store i128 [[TMP2]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
@@ -189,12 +184,8 @@ __int128 f13() {
 
 // CHECK-LABEL: @f14(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP:%.*]] = alloca i128, align 8
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_nand_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[TMP]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[TMP]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]]
 // CHECK-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP2]], -1
 // CHECK-NEXT:    store i128 [[TMP3]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
@@ -206,10 +197,9 @@ __int128 f14() {
 
 // CHECK-LABEL: @f15(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_add_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 __int128 f15() {
@@ -218,10 +208,9 @@ __int128 f15() {
 
 // CHECK-LABEL: @f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_sub_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 __int128 f16() {
@@ -230,10 +219,9 @@ __int128 f16() {
 
 // CHECK-LABEL: @f17(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_and_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 __int128 f17() {
@@ -242,10 +230,9 @@ __int128 f17() {
 
 // CHECK-LABEL: @f18(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_xor_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 __int128 f18() {
@@ -254,10 +241,9 @@ __int128 f18() {
 
 // CHECK-LABEL: @f19(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_or_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 __int128 f19() {
@@ -266,10 +252,9 @@ __int128 f19() {
 
 // CHECK-LABEL: @f20(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca i128, align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    store i128 [[TMP0]], ptr [[INDIRECT_ARG_TEMP]], align 8, !tbaa [[TBAA2]]
-// CHECK-NEXT:    call void @__atomic_fetch_nand_16(ptr dead_on_unwind nonnull writable sret(i128) align 8 [[AGG_RESULT:%.*]], ptr noundef nonnull @Ptr, ptr noundef nonnull [[INDIRECT_ARG_TEMP]], i32 noundef signext 5)
+// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 8
+// CHECK-NEXT:    store i128 [[TMP1]], ptr [[AGG_RESULT:%.*]], align 8, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 __int128 f20() {

diff  --git a/clang/test/CodeGen/arm-atomics-m.c b/clang/test/CodeGen/arm-atomics-m.c
index b9cc72bc6b98ab..6087fd9d6a66ae 100644
--- a/clang/test/CodeGen/arm-atomics-m.c
+++ b/clang/test/CodeGen/arm-atomics-m.c
@@ -22,14 +22,14 @@ void test_presence(void)
   r = 0;
   __atomic_store(&i, &r, memory_order_seq_cst);
 
-  // CHECK: __atomic_fetch_add_8
+  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
   __atomic_fetch_add(&l, 1, memory_order_seq_cst);
-  // CHECK: __atomic_fetch_sub_8
+  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
   __atomic_fetch_sub(&l, 1, memory_order_seq_cst);
-  // CHECK: __atomic_load_8
+  // CHECK: load atomic i64, ptr {{.*}} seq_cst, align 8
   long long rl;
   __atomic_load(&l, &rl, memory_order_seq_cst);
-  // CHECK: __atomic_store_8
+  // CHECK: store atomic i64 {{.*}}, ptr {{.*}} seq_cst, align 8
   rl = 0;
   __atomic_store(&l, &rl, memory_order_seq_cst);
 }

diff  --git a/clang/test/CodeGen/arm-atomics-m0.c b/clang/test/CodeGen/arm-atomics-m0.c
index 335a1d2711f808..94e344cf608df4 100644
--- a/clang/test/CodeGen/arm-atomics-m0.c
+++ b/clang/test/CodeGen/arm-atomics-m0.c
@@ -11,25 +11,25 @@ typedef enum memory_order {
 void test_presence(void)
 {
   // CHECK-LABEL: @test_presence
-  // CHECK: __atomic_fetch_add_4
+  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
   __atomic_fetch_add(&i, 1, memory_order_seq_cst);
-  // CHECK: __atomic_fetch_sub_4
+  // CHECK: atomicrmw sub {{.*}} seq_cst, align 4
   __atomic_fetch_sub(&i, 1, memory_order_seq_cst);
-  // CHECK: __atomic_load_4
+  // CHECK: load atomic i32, ptr {{.*}} seq_cst, align 4
   int r;
   __atomic_load(&i, &r, memory_order_seq_cst);
-  // CHECK: __atomic_store_4
+  // CHECK: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
   r = 0;
   __atomic_store(&i, &r, memory_order_seq_cst);
 
-  // CHECK: __atomic_fetch_add_8
+  // CHECK: atomicrmw add {{.*}} seq_cst, align 8
   __atomic_fetch_add(&l, 1, memory_order_seq_cst);
-  // CHECK: __atomic_fetch_sub_8
+  // CHECK: atomicrmw sub {{.*}} seq_cst, align 8
   __atomic_fetch_sub(&l, 1, memory_order_seq_cst);
-  // CHECK: __atomic_load_8
+  // CHECK: load atomic i64, ptr {{.*}} seq_cst, align 8
   long long rl;
   __atomic_load(&l, &rl, memory_order_seq_cst);
-  // CHECK: __atomic_store_8
+  // CHECK: store atomic i64 {{.*}}, ptr {{.*}} seq_cst, align 8
   rl = 0;
   __atomic_store(&l, &rl, memory_order_seq_cst);
 }
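
Both ARM M-profile tests above follow the same shape; a condensed sketch, assuming the usual global declarations (int i; long long l;) that sit outside these hunks, with a hypothetical wrapper name in place of test_presence:

int i;                                  /* assumed globals */
long long l;

void fetch_add_sketch(void) {
  __atomic_fetch_add(&i, 1, __ATOMIC_SEQ_CST);  /* now: atomicrmw add ..., align 4 */
  __atomic_fetch_add(&l, 1, __ATOMIC_SEQ_CST);  /* now: atomicrmw add ..., align 8 */
}

The updated checks expect the same atomic IR for the 4-byte and 8-byte operations on both cores; the checks no longer encode whether a given core can inline a given width.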

diff --git a/clang/test/CodeGen/atomic-ops-libcall.c b/clang/test/CodeGen/atomic-ops-libcall.c
index 745ccd22bf33f0..38a23f7236ce72 100644
--- a/clang/test/CodeGen/atomic-ops-libcall.c
+++ b/clang/test/CodeGen/atomic-ops-libcall.c
@@ -1,120 +1,338 @@
-// RUN: %clang_cc1 < %s -triple armv5e-none-linux-gnueabi -emit-llvm -O1 | FileCheck %s
-
-// FIXME: This file should not be checking -O1 output.
-// Ie, it is testing many IR optimizer passes as part of front-end verification.
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple armv5e-none-linux-gnueabi -emit-llvm %s -o - | FileCheck %s
 
 enum memory_order {
   memory_order_relaxed, memory_order_consume, memory_order_acquire,
   memory_order_release, memory_order_acq_rel, memory_order_seq_cst
 };
 
+// CHECK-LABEL: define dso_local ptr @test_c11_atomic_fetch_add_int_ptr(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 12, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret ptr [[TMP3]]
+//
 int *test_c11_atomic_fetch_add_int_ptr(_Atomic(int *) *p) {
-  // CHECK: test_c11_atomic_fetch_add_int_ptr
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(ptr noundef %p, i32 noundef 12, i32 noundef 5)
   return __c11_atomic_fetch_add(p, 3, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local ptr @test_c11_atomic_fetch_sub_int_ptr(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 20, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret ptr [[TMP3]]
+//
 int *test_c11_atomic_fetch_sub_int_ptr(_Atomic(int *) *p) {
-  // CHECK: test_c11_atomic_fetch_sub_int_ptr
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 20, i32 noundef 5)
   return __c11_atomic_fetch_sub(p, 5, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_c11_atomic_fetch_add_int(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 int test_c11_atomic_fetch_add_int(_Atomic(int) *p) {
-  // CHECK: test_c11_atomic_fetch_add_int
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(ptr noundef %p, i32 noundef 3, i32 noundef 5)
   return __c11_atomic_fetch_add(p, 3, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_c11_atomic_fetch_sub_int(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 5, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 int test_c11_atomic_fetch_sub_int(_Atomic(int) *p) {
-  // CHECK: test_c11_atomic_fetch_sub_int
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 5, i32 noundef 5)
   return __c11_atomic_fetch_sub(p, 5, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local ptr @fp2a(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 4, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret ptr [[TMP3]]
+//
 int *fp2a(int **p) {
-  // CHECK: @fp2a
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 4, i32 noundef 0)
   // Note, the GNU builtins do not multiply by sizeof(T)!
   return __atomic_fetch_sub(p, 4, memory_order_relaxed);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_add(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 int test_atomic_fetch_add(int *p) {
-  // CHECK: test_atomic_fetch_add
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
   return __atomic_fetch_add(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_sub(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 int test_atomic_fetch_sub(int *p) {
-  // CHECK: test_atomic_fetch_sub
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
   return __atomic_fetch_sub(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_and(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw and ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 int test_atomic_fetch_and(int *p) {
-  // CHECK: test_atomic_fetch_and
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_and_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
   return __atomic_fetch_and(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_or(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw or ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 int test_atomic_fetch_or(int *p) {
-  // CHECK: test_atomic_fetch_or
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_or_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
   return __atomic_fetch_or(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_xor(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw xor ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 int test_atomic_fetch_xor(int *p) {
-  // CHECK: test_atomic_fetch_xor
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_xor_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
   return __atomic_fetch_xor(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_fetch_nand(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw nand ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 int test_atomic_fetch_nand(int *p) {
-  // CHECK: test_atomic_fetch_nand
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_nand_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
   return __atomic_fetch_nand(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_add_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_atomic_add_fetch(int *p) {
-  // CHECK: test_atomic_add_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_add_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
-  // CHECK: {{%[^ ]*}} = add i32 [[CALL]], 55
   return __atomic_add_fetch(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_sub_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw sub ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_atomic_sub_fetch(int *p) {
-  // CHECK: test_atomic_sub_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_sub_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
-  // CHECK: {{%[^ ]*}} = add i32 [[CALL]], -55
   return __atomic_sub_fetch(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_and_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw and ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_atomic_and_fetch(int *p) {
-  // CHECK: test_atomic_and_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_and_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
-  // CHECK: {{%[^ ]*}} = and i32 [[CALL]], 55
   return __atomic_and_fetch(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_or_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw or ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_atomic_or_fetch(int *p) {
-  // CHECK: test_atomic_or_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_or_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
-  // CHECK: {{%[^ ]*}} = or i32 [[CALL]], 55
   return __atomic_or_fetch(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_xor_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw xor ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_atomic_xor_fetch(int *p) {
-  // CHECK: test_atomic_xor_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_xor_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
-  // CHECK: {{%[^ ]*}} = xor i32 [[CALL]], 55
   return __atomic_xor_fetch(p, 55, memory_order_seq_cst);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_atomic_nand_fetch(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 4
+// CHECK-NEXT:    store i32 55, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = atomicrmw nand ptr [[TMP0]], i32 [[TMP1]] seq_cst, align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP3]], -1
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT:    ret i32 [[TMP5]]
+//
 int test_atomic_nand_fetch(int *p) {
-  // CHECK: test_atomic_nand_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_nand_4(ptr noundef %p, i32 noundef 55, i32 noundef 5)
-  // FIXME: We should not be checking optimized IR. It changes independently of clang.
-  // FIXME-CHECK: [[AND:%[^ ]*]] = and i32 [[CALL]], 55
-  // FIXME-CHECK: {{%[^ ]*}} = xor i32 [[AND]], -1
   return __atomic_nand_fetch(p, 55, memory_order_seq_cst);
 }
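
One detail worth calling out in the new checks for the op_fetch builtins: they return the updated value, so the emitted IR is the underlying atomicrmw followed by a recomputation, e.g. for nand the trailing `and` plus `xor ..., -1`. A non-atomic sketch of that identity (hypothetical helper name):

int nand_fetch_result(int old, int v) {
  /* __atomic_nand_fetch(p, v, order) yields ~(old & v), which is what the
     `and` + `xor ..., -1` pair after the atomicrmw computes */
  return ~(old & v);
}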

diff --git a/clang/test/CodeGen/atomic-ops.c b/clang/test/CodeGen/atomic-ops.c
index 9ac05d270b97c5..b6060dcc540f90 100644
--- a/clang/test/CodeGen/atomic-ops.c
+++ b/clang/test/CodeGen/atomic-ops.c
@@ -198,7 +198,8 @@ struct S implicit_load(_Atomic(struct S) *a) {
 struct S fd1(struct S *a) {
   // CHECK-LABEL: @fd1
   // CHECK: [[RETVAL:%.*]] = alloca %struct.S, align 4
-  // CHECK: call void @__atomic_load(i32 noundef 8, ptr noundef {{.*}}, ptr noundef [[RETVAL]], i32 noundef 5)
+  // CHECK: [[TMP1:%.*]] = load atomic i64, ptr {{%.*}} seq_cst, align 4
+  // CHECK-NEXT: store i64 [[TMP1]], ptr [[RETVAL]], align 4
   // CHECK: ret
   struct S ret;
   __atomic_load(a, &ret, memory_order_seq_cst);
@@ -213,7 +214,8 @@ void fd2(struct S *a, struct S *b) {
   // CHECK-NEXT: store ptr %b, ptr [[B_ADDR]], align 4
   // CHECK-NEXT: [[LOAD_A_PTR:%.*]] = load ptr, ptr [[A_ADDR]], align 4
   // CHECK-NEXT: [[LOAD_B_PTR:%.*]] = load ptr, ptr [[B_ADDR]], align 4
-  // CHECK-NEXT: call void @__atomic_store(i32 noundef 8, ptr noundef [[LOAD_A_PTR]], ptr noundef [[LOAD_B_PTR]],
+  // CHECK-NEXT: [[LOAD_B:%.*]] = load i64, ptr [[LOAD_B_PTR]], align 4
+  // CHECK-NEXT: store atomic i64 [[LOAD_B]], ptr [[LOAD_A_PTR]] seq_cst, align 4
   // CHECK-NEXT: ret void
   __atomic_store(a, b, memory_order_seq_cst);
 }
@@ -229,7 +231,9 @@ void fd3(struct S *a, struct S *b, struct S *c) {
   // CHECK-NEXT: [[LOAD_A_PTR:%.*]] = load ptr, ptr [[A_ADDR]], align 4
   // CHECK-NEXT: [[LOAD_B_PTR:%.*]] = load ptr, ptr [[B_ADDR]], align 4
   // CHECK-NEXT: [[LOAD_C_PTR:%.*]] = load ptr, ptr [[C_ADDR]], align 4
-  // CHECK-NEXT: call void @__atomic_exchange(i32 noundef 8, ptr noundef [[LOAD_A_PTR]], ptr noundef [[LOAD_B_PTR]], ptr noundef [[LOAD_C_PTR]],
+  // CHECK-NEXT: [[LOAD_B:%.*]] = load i64, ptr [[LOAD_B_PTR]], align 4
+  // CHECK-NEXT: [[RESULT:%.*]] = atomicrmw xchg ptr [[LOAD_A_PTR]], i64 [[LOAD_B]] seq_cst, align 4
+  // CHECK-NEXT: store i64 [[RESULT]], ptr [[LOAD_C_PTR]], align 4
 
   __atomic_exchange(a, b, c, memory_order_seq_cst);
 }
@@ -245,8 +249,9 @@ _Bool fd4(struct S *a, struct S *b, struct S *c) {
   // CHECK-NEXT: [[LOAD_A_PTR:%.*]] = load ptr, ptr [[A_ADDR]], align 4
   // CHECK-NEXT: [[LOAD_B_PTR:%.*]] = load ptr, ptr [[B_ADDR]], align 4
   // CHECK-NEXT: [[LOAD_C_PTR:%.*]] = load ptr, ptr [[C_ADDR]], align 4
-  // CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 8, ptr noundef [[LOAD_A_PTR]], ptr noundef [[LOAD_B_PTR]], ptr noundef [[LOAD_C_PTR]],
-  // CHECK-NEXT: ret i1 [[CALL]]
+  // CHECK-NEXT: [[LOAD_B:%.*]] = load i64, ptr [[LOAD_B_PTR]], align 4
+  // CHECK-NEXT: [[LOAD_C:%.*]] = load i64, ptr [[LOAD_C_PTR]], align 4
+  // CHECK-NEXT: {{.*}} = cmpxchg weak ptr [[LOAD_A_PTR]], i64 [[LOAD_B]], i64 [[LOAD_C]] seq_cst seq_cst, align 4
   return __atomic_compare_exchange(a, b, c, 1, 5, 5);
 }
 
@@ -682,13 +687,13 @@ void test_underaligned(void) {
   // CHECK-LABEL: @test_underaligned
   struct Underaligned { char c[8]; } underaligned_a, underaligned_b, underaligned_c;
 
-  // CHECK: call void @__atomic_load(i32 noundef 8,
+  // CHECK: load atomic i64, {{.*}}, align 1
   __atomic_load(&underaligned_a, &underaligned_b, memory_order_seq_cst);
-  // CHECK: call void @__atomic_store(i32 noundef 8,
+  // CHECK: store atomic i64 {{.*}}, align 1
   __atomic_store(&underaligned_a, &underaligned_b, memory_order_seq_cst);
-  // CHECK: call void @__atomic_exchange(i32 noundef 8,
+  // CHECK: atomicrmw xchg ptr {{.*}}, align 1
   __atomic_exchange(&underaligned_a, &underaligned_b, &underaligned_c, memory_order_seq_cst);
-  // CHECK: call {{.*}} @__atomic_compare_exchange(i32 noundef 8,
+  // CHECK: cmpxchg weak ptr {{.*}}, align 1
   __atomic_compare_exchange(&underaligned_a, &underaligned_b, &underaligned_c, 1, memory_order_seq_cst, memory_order_seq_cst);
 
   __attribute__((aligned)) struct Underaligned aligned_a, aligned_b, aligned_c;
@@ -747,7 +752,7 @@ void test_minmax_postop(int *si, unsigned *ui, unsigned short *us, signed char *
   // CHECK: [[NEW:%.*]] = select i1 [[TST]], i32 [[OLD]], i32 [[RHS]]
   // CHECK: store i32 [[NEW]], ptr
   *si = __atomic_min_fetch(si, 42, memory_order_release);
-  
+
   // CHECK: [[OLD:%.*]] = atomicrmw umax ptr [[PTR:%.*]], i32 [[RHS:%.*]] release, align 4
   // CHECK: [[TST:%.*]] = icmp ugt i32 [[OLD]], [[RHS]]
   // CHECK: [[NEW:%.*]] = select i1 [[TST]], i32 [[OLD]], i32 [[RHS]]
@@ -772,7 +777,7 @@ void test_minmax_postop(int *si, unsigned *ui, unsigned short *us, signed char *
   // CHECK: store i8 [[NEW]], ptr
   *sc = __atomic_min_fetch(sc, 42, memory_order_release);
 
-  // CHECK: [[OLD:%.*]] = call i64 @__atomic_fetch_umin_8(ptr noundef {{%.*}}, i64 noundef [[RHS:%.*]],
+  // CHECK: [[OLD:%.*]] = atomicrmw umin ptr {{%.*}}, i64 [[RHS:%.*]] release, align 4
   // CHECK: [[TST:%.*]] = icmp ult i64 [[OLD]], [[RHS]]
   // CHECK: [[NEW:%.*]] = select i1 [[TST]], i64 [[OLD]], i64 [[RHS]]
   // CHECK: store i64 [[NEW]], ptr
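
The fd1-fd4 checks above cover the generic __atomic_* builtins on an 8-byte aggregate; a sketch of fd3's shape, assuming struct S is the 8-byte, 4-byte-aligned type implied by the checks (its real definition is outside these hunks, and the function name here is illustrative):

struct S { int a, b; };                 /* assumed layout */

void exchange_sketch(struct S *a, struct S *b, struct S *c) {
  /* now: load i64 from *b, atomicrmw xchg on *a, store the i64 result to *c,
     instead of a call to @__atomic_exchange */
  __atomic_exchange(a, b, c, __ATOMIC_SEQ_CST);
}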

diff --git a/clang/test/CodeGen/atomics-inlining.c b/clang/test/CodeGen/atomics-inlining.c
index 862c63076b2dc0..217a294ee84abc 100644
--- a/clang/test/CodeGen/atomics-inlining.c
+++ b/clang/test/CodeGen/atomics-inlining.c
@@ -38,14 +38,14 @@ void test1(void) {
   (void)__atomic_store(&a1, &a2, memory_order_seq_cst);
 
 // ARM-LABEL: define{{.*}} void @test1
-// ARM: = call{{.*}} zeroext i8 @__atomic_load_1(ptr noundef @c1
-// ARM: call{{.*}} void @__atomic_store_1(ptr noundef @c1, i8 noundef zeroext
-// ARM: = call{{.*}} zeroext i16 @__atomic_load_2(ptr noundef @s1
-// ARM: call{{.*}} void @__atomic_store_2(ptr noundef @s1, i16 noundef zeroext
-// ARM: = call{{.*}} i32 @__atomic_load_4(ptr noundef @i1
-// ARM: call{{.*}} void @__atomic_store_4(ptr noundef @i1, i32 noundef
-// ARM: = call{{.*}} i64 @__atomic_load_8(ptr noundef @ll1
-// ARM: call{{.*}} void @__atomic_store_8(ptr noundef @ll1, i64 noundef
+// ARM: = load atomic i8, ptr @c1 seq_cst, align 1
+// ARM: store atomic i8 {{.*}}, ptr @c1 seq_cst, align 1
+// ARM: = load atomic i16, ptr @s1 seq_cst, align 2
+// ARM: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2
+// ARM: = load atomic i32, ptr @i1 seq_cst, align 4
+// ARM: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
+// ARM: = load atomic i64, ptr @ll1 seq_cst, align 8
+// ARM: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
 // ARM: call{{.*}} void @__atomic_load(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
 // ARM: call{{.*}} void @__atomic_store(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
 
@@ -56,8 +56,8 @@ void test1(void) {
 // PPC32: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2
 // PPC32: = load atomic i32, ptr @i1 seq_cst, align 4
 // PPC32: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
-// PPC32: = call i64 @__atomic_load_8(ptr noundef @ll1
-// PPC32: call void @__atomic_store_8(ptr noundef @ll1, i64
+// PPC32: = load atomic i64, ptr @ll1 seq_cst, align 8
+// PPC32: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
 // PPC32: call void @__atomic_load(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
 // PPC32: call void @__atomic_store(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
 
@@ -80,8 +80,8 @@ void test1(void) {
 // MIPS32: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2
 // MIPS32: = load atomic i32, ptr @i1 seq_cst, align 4
 // MIPS32: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
-// MIPS32: call i64 @__atomic_load_8(ptr noundef @ll1
-// MIPS32: call void @__atomic_store_8(ptr noundef @ll1, i64
+// MIPS32: = load atomic i64, ptr @ll1 seq_cst, align 8
+// MIPS32: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
 // MIPS32: call void @__atomic_load(i32 noundef signext 100, ptr noundef @a1, ptr noundef @a2
 // MIPS32: call void @__atomic_store(i32 noundef signext 100, ptr noundef @a1, ptr noundef @a2
 
@@ -94,7 +94,7 @@ void test1(void) {
 // MIPS64: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
 // MIPS64: = load atomic i64, ptr @ll1 seq_cst, align 8
 // MIPS64: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
-// MIPS64: call void @__atomic_load(i64 noundef zeroext 100, ptr noundef @a1
+// MIPS64: call void @__atomic_load(i64 noundef zeroext 100, ptr noundef @a1, ptr noundef @a2
 // MIPS64: call void @__atomic_store(i64 noundef zeroext 100, ptr noundef @a1, ptr noundef @a2
 
 // SPARC-LABEL: define{{.*}} void @test1
@@ -104,12 +104,12 @@ void test1(void) {
 // SPARC: store atomic i16 {{.*}}, ptr @s1 seq_cst, align 2
 // SPARC: = load atomic i32, ptr @i1 seq_cst, align 4
 // SPARC: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
-// SPARCV8: call i64 @__atomic_load_8(ptr noundef @ll1
-// SPARCV8: call void @__atomic_store_8(ptr noundef @ll1, i64
-// SPARCV9: load atomic i64, ptr @ll1 seq_cst, align 8
-// SPARCV9: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
+// SPARC: load atomic i64, ptr @ll1 seq_cst, align 8
+// SPARC: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
 // SPARCV8: call void @__atomic_load(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
 // SPARCV8: call void @__atomic_store(i32 noundef 100, ptr noundef @a1, ptr noundef @a2
+// SPARCV9: call void @__atomic_load(i64 noundef 100, ptr noundef @a1, ptr noundef @a2
+// SPARCV9: call void @__atomic_store(i64 noundef 100, ptr noundef @a1, ptr noundef @a2
 
 // NVPTX-LABEL: define{{.*}} void @test1
 // NVPTX: = load atomic i8, ptr @c1 seq_cst, align 1
@@ -120,7 +120,7 @@ void test1(void) {
 // NVPTX: store atomic i32 {{.*}}, ptr @i1 seq_cst, align 4
 // NVPTX: = load atomic i64, ptr @ll1 seq_cst, align 8
 // NVPTX: store atomic i64 {{.*}}, ptr @ll1 seq_cst, align 8
-// NVPTX: call void @__atomic_load(i64 noundef 100, ptr noundef @a1, ptr noundef @a2, i32 noundef 5)
-// NVPTX: call void @__atomic_store(i64 noundef 100, ptr noundef @a1, ptr noundef @a2, i32 noundef 5)
+// NVPTX: call void @__atomic_load(i64 noundef 100, ptr noundef @a1, ptr noundef @a2
+// NVPTX: call void @__atomic_store(i64 noundef 100, ptr noundef @a1, ptr noundef @a2
 
 }
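
A sketch of what test1 operates on (assumed declarations; c2, ll2, A100, and the wrapper name are illustrative): the per-target checks now expect inline load/store atomic for the 1-, 2-, 4-, and 8-byte objects, while the 100-byte aggregate still goes through the generic runtime calls on every target.

char c1, c2;
long long ll1, ll2;
struct A100 { char x[100]; } a1, a2;

void load_sketch(void) {
  __atomic_load(&c1, &c2, __ATOMIC_SEQ_CST);    /* load atomic i8 */
  __atomic_load(&ll1, &ll2, __ATOMIC_SEQ_CST);  /* load atomic i64 */
  __atomic_load(&a1, &a2, __ATOMIC_SEQ_CST);    /* still: call @__atomic_load(... 100, ...) */
}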

diff --git a/clang/test/CodeGen/c11atomics.c b/clang/test/CodeGen/c11atomics.c
index dd1f52f70ae09f..4da36ad4da0f92 100644
--- a/clang/test/CodeGen/c11atomics.c
+++ b/clang/test/CodeGen/c11atomics.c
@@ -343,10 +343,9 @@ PS test_promoted_load(_Atomic(PS) *addr) {
   // CHECK:   [[ATOMIC_RES:%.*]] = alloca { %struct.PS, [2 x i8] }, align 8
   // CHECK:   store ptr %addr, ptr [[ADDR_ARG]], align 4
   // CHECK:   [[ADDR:%.*]] = load ptr, ptr [[ADDR_ARG]], align 4
-  // CHECK:   [[RES:%.*]] = call arm_aapcscc i64 @__atomic_load_8(ptr noundef [[ADDR]], i32 noundef 5)
-  // CHECK:   store i64 [[RES]], ptr [[ATOMIC_RES]], align 8
-  // CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr align 2 %agg.result, ptr align 8 [[ATOMIC_RES]], i32 6, i1 false)
-
+  // CHECK:   [[ATOMIC_RES:%.*]] = load atomic i64, ptr [[ADDR]] seq_cst, align 8
+  // CHECK:   store i64 [[ATOMIC_RES]], ptr [[ATOMIC_RES_ADDR:%.*]], align 8
+  // CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr align 2 %agg.result, ptr align 8 [[ATOMIC_RES_ADDR]], i32 6, i1 false)
   return __c11_atomic_load(addr, 5);
 }
 
@@ -362,8 +361,8 @@ void test_promoted_store(_Atomic(PS) *addr, PS *val) {
   // CHECK:   [[VAL:%.*]] = load ptr, ptr [[VAL_ARG]], align 4
   // CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[NONATOMIC_TMP]], ptr align 2 [[VAL]], i32 6, i1 false)
   // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_VAL]], ptr align 2 [[NONATOMIC_TMP]], i64 6, i1 false)
-  // CHECK:   [[VAL64:%.*]] = load i64, ptr [[ATOMIC_VAL]], align 2
-  // CHECK:   call arm_aapcscc void @__atomic_store_8(ptr noundef [[ADDR]], i64 noundef [[VAL64]], i32 noundef 5)
+  // CHECK:   [[ATOMIC:%.*]] = load i64, ptr [[ATOMIC_VAL]], align 8
+  // CHECK:   store atomic i64 [[ATOMIC]], ptr [[ADDR]] seq_cst, align 8
   __c11_atomic_store(addr, *val, 5);
 }
 
@@ -380,10 +379,10 @@ PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) {
   // CHECK:   [[VAL:%.*]] = load ptr, ptr [[VAL_ARG]], align 4
   // CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[NONATOMIC_TMP]], ptr align 2 [[VAL]], i32 6, i1 false)
   // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_VAL]], ptr align 2 [[NONATOMIC_TMP]], i64 6, i1 false)
-  // CHECK:   [[VAL64:%.*]] = load i64, ptr [[ATOMIC_VAL]], align 2
-  // CHECK:   [[RES:%.*]] = call arm_aapcscc i64 @__atomic_exchange_8(ptr noundef [[ADDR]], i64 noundef [[VAL64]], i32 noundef 5)
-  // CHECK:   store i64 [[RES]], ptr [[ATOMIC_RES]], align 8
-  // CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr align 2 %agg.result, ptr align 8 [[ATOMIC_RES]], i32 6, i1 false)
+  // CHECK:   [[ATOMIC:%.*]] = load i64, ptr [[ATOMIC_VAL]], align 8
+  // CHECK:   [[ATOMIC_RES:%.*]] = atomicrmw xchg ptr [[ADDR]], i64 [[ATOMIC]] seq_cst, align 8
+  // CHECK:   store i64 [[ATOMIC_RES]], ptr [[ATOMIC_RES_PTR:%.*]], align 8
+  // CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr align 2 %agg.result, ptr align 8 [[ATOMIC_RES_PTR]], i32 6, i1 false)
   return __c11_atomic_exchange(addr, *val, 5);
 }
 
@@ -404,9 +403,10 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
   // CHECK:   call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[NONATOMIC_TMP]], ptr align 2 [[NEW]], i32 6, i1 false)
   // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_DESIRED]], ptr align 2 [[DESIRED]], i64 6, i1 false)
   // CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_NEW]], ptr align 2 [[NONATOMIC_TMP]], i64 6, i1 false)
-  // CHECK:   [[NEW64:%.*]] = load i64, ptr [[ATOMIC_NEW]], align 2
-  // CHECK:   [[RES:%.*]] = call arm_aapcscc zeroext i1 @__atomic_compare_exchange_8(ptr noundef [[ADDR]], ptr noundef [[ATOMIC_DESIRED]], i64 noundef [[NEW64]], i32 noundef 5, i32 noundef 5)
-  // CHECK:   ret i1 [[RES]]
+  // CHECK:   [[VAL1:%.*]] = load i64, ptr [[ATOMIC_DESIRED]], align 8
+  // CHECK:   [[VAL2:%.*]] = load i64, ptr [[ATOMIC_NEW]], align 8
+  // CHECK:   [[RES_PAIR:%.*]] = cmpxchg ptr [[ADDR]], i64 [[VAL1]], i64 [[VAL2]] seq_cst seq_cst, align 8
+  // CHECK:   [[RES:%.*]] = extractvalue { i64, i1 } [[RES_PAIR]], 1
   return __c11_atomic_compare_exchange_strong(addr, desired, *new, 5, 5);
 }
 
@@ -414,12 +414,12 @@ struct Empty {};
 
 struct Empty test_empty_struct_load(_Atomic(struct Empty)* empty) {
   // CHECK-LABEL: @test_empty_struct_load(
-  // CHECK: call arm_aapcscc zeroext i8 @__atomic_load_1(ptr noundef %{{.*}}, i32 noundef 5)
+  // CHECK: load atomic i8, ptr {{.*}}, align 1
   return __c11_atomic_load(empty, 5);
 }
 
 void test_empty_struct_store(_Atomic(struct Empty)* empty, struct Empty value) {
   // CHECK-LABEL: @test_empty_struct_store(
-  // CHECK: call arm_aapcscc void @__atomic_store_1(ptr noundef %{{.*}}, i8 noundef zeroext %{{.*}}, i32 noundef 5)
+  // CHECK: store atomic i8 {{.*}}, ptr {{.*}}, align 1
   __c11_atomic_store(empty, value, 5);
 }
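
For the promoted-struct tests above, a sketch assuming PS is the 6-byte struct implied by the `i32 6` memcpy sizes (e.g. three shorts, padded to 8 bytes in its _Atomic form); the layout and the wrapper name are assumptions:

typedef struct { short x, y, z; } PS;   /* assumed layout */

_Bool cmpxchg_ps_sketch(_Atomic(PS) *addr, PS *desired, PS *new_val) {
  /* now: two i64 loads, a `cmpxchg ... seq_cst seq_cst, align 8`, and an
     extractvalue of the success bit, instead of __atomic_compare_exchange_8 */
  return __c11_atomic_compare_exchange_strong(addr, desired, *new_val, 5, 5);
}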

diff --git a/clang/test/CodeGenCXX/atomic-inline.cpp b/clang/test/CodeGenCXX/atomic-inline.cpp
index 701bbd57b485c7..c8fa877a37beb5 100644
--- a/clang/test/CodeGenCXX/atomic-inline.cpp
+++ b/clang/test/CodeGenCXX/atomic-inline.cpp
@@ -42,7 +42,7 @@ AM16 m16;
 AM16 load16() {
   AM16 am;
   // CHECK-LABEL: @_Z6load16v
-  // CHECK: call void @__atomic_load
+  // CHECK: load atomic i128, {{.*}} monotonic, align 16
   // CORE2-LABEL: @_Z6load16v
   // CORE2: load atomic i128, {{.*}} monotonic, align 16
   __atomic_load(&m16, &am, 0);
@@ -52,7 +52,7 @@ AM16 load16() {
 AM16 s16;
 void store16() {
   // CHECK-LABEL: @_Z7store16v
-  // CHECK: call void @__atomic_store
+  // CHECK: store atomic i128 {{.*}} monotonic, align 16
   // CORE2-LABEL: @_Z7store16v
   // CORE2: store atomic i128 {{.*}} monotonic, align 16
   __atomic_store(&m16, &s16, 0);
@@ -61,7 +61,7 @@ void store16() {
 bool cmpxchg16() {
   AM16 am;
   // CHECK-LABEL: @_Z9cmpxchg16v
-  // CHECK: call noundef zeroext i1 @__atomic_compare_exchange
+  // CHECK: cmpxchg ptr {{.*}} monotonic monotonic, align 16
   // CORE2-LABEL: @_Z9cmpxchg16v
   // CORE2: cmpxchg ptr {{.*}} monotonic monotonic, align 16
   return __atomic_compare_exchange(&m16, &s16, &am, 0, 0, 0);
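
A C rendering of the 16-byte case above, assuming AM16 is a 16-byte, 16-byte-aligned aggregate (its real definition sits outside these hunks):

struct AM16 { _Alignas(16) long long x[2]; } m16, s16;  /* assumed layout */

_Bool cmpxchg16_sketch(void) {
  struct AM16 am = {{0, 0}};
  /* both the generic and the CORE2 run lines now expect
     `cmpxchg ptr ... monotonic monotonic, align 16` */
  return __atomic_compare_exchange(&m16, &s16, &am, 0, 0, 0);
}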

diff --git a/clang/test/CodeGenOpenCL/atomic-ops-libcall.cl b/clang/test/CodeGenOpenCL/atomic-ops-libcall.cl
index 2f020c21082124..d615ff6bec4140 100644
--- a/clang/test/CodeGenOpenCL/atomic-ops-libcall.cl
+++ b/clang/test/CodeGenOpenCL/atomic-ops-libcall.cl
@@ -20,63 +20,60 @@ typedef enum memory_scope {
 
 void f(atomic_int *i, global atomic_int *gi, local atomic_int *li, private atomic_int *pi, atomic_uint *ui, int cmp, int order, int scope) {
   int x;
-  // SPIR: {{%[^ ]*}} = call i32 @__opencl_atomic_load_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
-  // ARM: {{%[^ ]*}} = call i32 @__opencl_atomic_load_4(ptr noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+  // SPIR: load atomic i32, ptr addrspace(4) {{.*}} seq_cst, align 4
+  // ARM: load atomic i32, ptr {{.*}} seq_cst, align 4
   x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: call void @__opencl_atomic_store_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
-  // ARM: call void @__opencl_atomic_store_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+  // SPIR: store atomic i32 {{.*}}, ptr addrspace(4) {{.*}} seq_cst, align 4
+  // ARM: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
   __opencl_atomic_store(i, 1, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: %[[GP:[0-9]+]] = addrspacecast ptr addrspace(1) {{%[0-9]+}} to ptr addrspace(4)
-  // SPIR: call void @__opencl_atomic_store_4(ptr addrspace(4) noundef %[[GP]], i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
-  // ARM: call void @__opencl_atomic_store_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+  // SPIR: store atomic i32 {{.*}}, ptr addrspace(1) {{.*}} seq_cst, align 4
+  // ARM: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
   __opencl_atomic_store(gi, 1, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: %[[GP:[0-9]+]] = addrspacecast ptr addrspace(3) {{%[0-9]+}} to ptr addrspace(4)
-  // SPIR: call void @__opencl_atomic_store_4(ptr addrspace(4) noundef %[[GP]], i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
-  // ARM: call void @__opencl_atomic_store_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+  // SPIR: store atomic i32 {{.*}}, ptr addrspace(3) {{.*}} seq_cst, align 4
+  // ARM: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
   __opencl_atomic_store(li, 1, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: %[[GP:[0-9]+]] = addrspacecast ptr {{%[0-9]+}} to ptr addrspace(4)
-  // SPIR: call void @__opencl_atomic_store_4(ptr addrspace(4) noundef %[[GP]], i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
-  // ARM: call void @__opencl_atomic_store_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+  // SPIR: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
+  // ARM: store atomic i32 {{.*}}, ptr {{.*}} seq_cst, align 4
   __opencl_atomic_store(pi, 1, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_add_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
-  // ARM: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_add_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+  // SPIR: atomicrmw add ptr addrspace(4) {{.*}}, i32 {{.*}} seq_cst, align 4
+  // ARM: atomicrmw add ptr {{.*}}, i32 {{.*}} seq_cst, align 4
   x = __opencl_atomic_fetch_add(i, 3, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_min_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
-  // ARM: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_min_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+  // SPIR: atomicrmw min ptr addrspace(4) {{.*}}, i32 {{.*}} seq_cst, align 4
+  // ARM: atomicrmw min ptr {{.*}}, i32 {{.*}} seq_cst, align 4
   x = __opencl_atomic_fetch_min(i, 3, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_umin_4(ptr addrspace(4) noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
-  // ARM: {{%[^ ]*}} = call i32 @__opencl_atomic_fetch_umin_4(ptr noundef {{%[0-9]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 1)
+  // SPIR: atomicrmw umin ptr addrspace(4) {{.*}}, i32 {{.*}} seq_cst, align 4
+  // ARM: atomicrmw umin ptr {{.*}}, i32 {{.*}} seq_cst, align 4
   x = __opencl_atomic_fetch_min(ui, 3, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 1)
-  // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 1)
+  // SPIR: cmpxchg ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+  // ARM: cmpxchg ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
   x = __opencl_atomic_compare_exchange_strong(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 1)
-  // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 1)
+  // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+  // ARM: cmpxchg weak ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
   x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group);
 
-  // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 2)
-  // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 2)
+  // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+  // ARM: cmpxchg weak ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
   x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
 
-  // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 3)
-  // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 3)
+  // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+  // ARM: cmpxchg weak ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
   x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_all_svm_devices);
 
 #ifdef cl_khr_subgroups
-  // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef 5, i32 noundef 5, i32 noundef 4)
+  // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
   x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_sub_group);
 #endif
 
-  // SPIR: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr addrspace(4) noundef {{%[0-9]+}}, ptr addrspace(4) noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}})
-  // ARM: {{%[^ ]*}} = call zeroext i1 @__opencl_atomic_compare_exchange_4(ptr noundef {{%[0-9]+}}, ptr noundef {{%[^,]+}}, i32 noundef {{%[0-9]+}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}})
+  // SPIR: cmpxchg weak ptr addrspace(4) {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
+  // ARM: cmpxchg weak ptr {{.*}}, i32 {{.*}}, i32 {{.*}} seq_cst seq_cst, align 4
   x = __opencl_atomic_compare_exchange_weak(i, &cmp, 1, order, order, scope);
 }


        

