[llvm] 2d1fba6 - [NVPTX] Support __usAtomicCAS builtin (#99646)

via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 28 12:01:39 PDT 2024


Author: Denis.G
Date: 2024-08-28T12:01:35-07:00
New Revision: 2d1fba67c54b7c6b7f57635f863d4e1cd5fcc048

URL: https://github.com/llvm/llvm-project/commit/2d1fba67c54b7c6b7f57635f863d4e1cd5fcc048
DIFF: https://github.com/llvm/llvm-project/commit/2d1fba67c54b7c6b7f57635f863d4e1cd5fcc048.diff

LOG: [NVPTX] Support __usAtomicCAS builtin (#99646)

Supported `__usAtomicCAS` builtin originally defined in
`/usr/local/cuda/inlcude/crt/sm_70_rt.hpp`

---------

Co-authored-by: Denis Gerasimov <Denis.Gerasimov at baikalelectronics.ru>
Co-authored-by: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Co-authored-by: Denis.Gerasimov <dengzmm at gmail.com>

Added: 
    llvm/test/CodeGen/NVPTX/cmpxchg.ll

Modified: 
    clang/include/clang/Basic/BuiltinsNVPTX.def
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Headers/__clang_cuda_device_functions.h
    clang/test/CodeGen/builtins-nvptx.c
    llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
    llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
    llvm/lib/Target/NVPTX/NVPTXSubtarget.h
    llvm/test/CodeGen/NVPTX/atomics-sm90.ll
    llvm/utils/UpdateTestChecks/common.py

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index c11970c279c4bb..20f038a0a9bbde 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -844,6 +844,9 @@ BUILTIN(__nvvm_atom_xor_gen_ll, "LLiLLiD*LLi", "n")
 TARGET_BUILTIN(__nvvm_atom_cta_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60)
 TARGET_BUILTIN(__nvvm_atom_sys_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60)
 
+TARGET_BUILTIN(__nvvm_atom_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
+TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
+TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
 BUILTIN(__nvvm_atom_cas_gen_i, "iiD*ii", "n")
 TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_i, "iiD*ii", "n", SM_60)
 TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_i, "iiD*ii", "n", SM_60)

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2a733e4d834cfa..4204c8ff276ab1 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20344,6 +20344,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_atom_min_gen_ull:
     return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
 
+  case NVPTX::BI__nvvm_atom_cas_gen_us:
   case NVPTX::BI__nvvm_atom_cas_gen_i:
   case NVPTX::BI__nvvm_atom_cas_gen_l:
   case NVPTX::BI__nvvm_atom_cas_gen_ll:
@@ -20535,6 +20536,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
   case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
     return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
+  case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
   case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
   case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
   case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
@@ -20546,6 +20548,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
             Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
         {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
   }
+  case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
   case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
   case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
   case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {

diff  --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
index f8a12cefdb81b4..86123727a1bc3f 100644
--- a/clang/lib/Headers/__clang_cuda_device_functions.h
+++ b/clang/lib/Headers/__clang_cuda_device_functions.h
@@ -529,6 +529,20 @@ __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
 __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
 __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
 __DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
+__DEVICE__ unsigned short
+__usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) {
+  return __nvvm_atom_cas_gen_us(__p, __cmp, __v);
+}
+__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short *__p,
+                                              unsigned short __cmp,
+                                              unsigned short __v) {
+  return __nvvm_atom_cta_cas_gen_us(__p, __cmp, __v);
+}
+__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short *__p,
+                                               unsigned short __cmp,
+                                               unsigned short __v) {
+  return __nvvm_atom_sys_cas_gen_us(__p, __cmp, __v);
+}
 __DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
   return __nvvm_atom_add_gen_i((int *)__p, __v);
 }

diff  --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 20399b73e63757..bfa72e8bd69454 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1,4 +1,7 @@
 // REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
+// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
 // RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
@@ -238,7 +241,8 @@ __shared__ long long sll;
 
 // Check for atomic intrinsics
 // CHECK-LABEL: nvvm_atom
-__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
+__device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
+                          unsigned short *usp, unsigned short us, int *ip,
                           int i, unsigned int *uip, unsigned ui, long *lp,
                           long l, long long *llp, long long ll) {
   // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
@@ -577,6 +581,16 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
   __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
 #endif
 
+#if __CUDA_ARCH__ >= 700
+  // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
+  // CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
+  __nvvm_atom_cas_gen_us(usp, 0, us);
+  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
+  __nvvm_atom_cta_cas_gen_us(usp, 0, us);
+  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0
+  __nvvm_atom_sys_cas_gen_us(usp, 0, us);
+#endif
+
   // CHECK: ret
 }
 

diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 73791102fc04de..3fcae6b0e2875b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -890,7 +890,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // actions
   computeRegisterProperties(STI.getRegisterInfo());
 
-  setMinCmpXchgSizeInBits(32);
+  setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
   setMaxAtomicSizeInBitsSupported(64);
   setMaxDivRemBitWidthSupported(64);
 }

diff  --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 371b4c21353331..0c883093dd0a54 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2076,6 +2076,12 @@ defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64
 
 // atom_cas
 
+def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
 def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
   (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
 def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
@@ -2089,6 +2095,14 @@ def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
 def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
   (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
 
+defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
+  atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
+  atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
+  atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
+  atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
 defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
   atomic_cmp_swap_i32_g, i32imm>;
 defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
@@ -2300,6 +2314,7 @@ multiclass ATOM2_incdec_impl<string OpStr> {
 
 // atom.cas
 multiclass ATOM3_cas_impl<string OpStr> {
+   defm _b16  : ATOM3S_impl<OpStr, "i", "b16", i16, Int16Regs, i16imm, imm, i16, []>;
    defm _b32  : ATOM3S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _b64  : ATOM3S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>;
 }

diff  --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e47050734aae1e..0591782e8148b9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -77,6 +77,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasAtomScope() const { return SmVersion >= 60; }
   bool hasAtomBitwise64() const { return SmVersion >= 32; }
   bool hasAtomMinMax64() const { return SmVersion >= 32; }
+  bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
   bool hasLDG() const { return SmVersion >= 32; }
   bool hasHWROT32() const { return SmVersion >= 32; }
   bool hasImageHandles() const;

diff  --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 9301ea44c69367..d69dd8ad1c9405 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -45,102 +45,62 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ;
 ; CHECKPTX71-LABEL: test(
 ; CHECKPTX71:       {
-; CHECKPTX71-NEXT:    .reg .pred %p<5>;
-; CHECKPTX71-NEXT:    .reg .b16 %rs<18>;
-; CHECKPTX71-NEXT:    .reg .b32 %r<58>;
-; CHECKPTX71-NEXT:    .reg .f32 %f<12>;
+; CHECKPTX71-NEXT:  	.reg .pred 	%p<5>;
+; CHECKPTX71-NEXT:  	.reg .b16 	%rs<34>;
+; CHECKPTX71-NEXT:  	.reg .b32 	%r<4>;
+; CHECKPTX71-NEXT:  	.reg .f32 	%f<12>;
 ; CHECKPTX71-EMPTY:
 ; CHECKPTX71-NEXT:  // %bb.0:
-; CHECKPTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX71-NEXT:    ld.param.u32 %r23, [test_param_2];
-; CHECKPTX71-NEXT:    ld.param.u32 %r22, [test_param_1];
-; CHECKPTX71-NEXT:    ld.param.u32 %r24, [test_param_0];
-; CHECKPTX71-NEXT:    and.b32 %r1, %r24, -4;
-; CHECKPTX71-NEXT:    and.b32 %r25, %r24, 3;
-; CHECKPTX71-NEXT:    shl.b32 %r2, %r25, 3;
-; CHECKPTX71-NEXT:    mov.b32 %r26, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
-; CHECKPTX71-NEXT:    not.b32 %r3, %r27;
-; CHECKPTX71-NEXT:    ld.u32 %r54, [%r1];
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f2, %rs1;
-; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start
+; CHECKPTX71-NEXT:  	ld.param.b16 	%rs13, [test_param_3];
+; CHECKPTX71-NEXT:  	ld.param.u32 	%r3, [test_param_2];
+; CHECKPTX71-NEXT:  	ld.param.u32 	%r2, [test_param_1];
+; CHECKPTX71-NEXT:  	ld.param.u32 	%r1, [test_param_0];
+; CHECKPTX71-NEXT:  	ld.b16 	%rs30, [%r1];
+; CHECKPTX71-NEXT:  	cvt.f32.bf16 	%f1, %rs13;
+; CHECKPTX71-NEXT:  $L__BB0_1:                              // %atomicrmw.start
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r28, %r54, %r2;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r28;
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f1, %rs2;
-; CHECKPTX71-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs4, %f3;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r29, %rs4;
-; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
-; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
-; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
-; CHECKPTX71-NEXT:    setp.ne.s32 %p1, %r6, %r54;
-; CHECKPTX71-NEXT:    mov.u32 %r54, %r6;
-; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
-; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end
-; CHECKPTX71-NEXT:    ld.u32 %r55, [%r1];
-; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start9
+; CHECKPTX71-NEXT:  	cvt.f32.bf16 	%f2, %rs30;
+; CHECKPTX71-NEXT:  	add.rn.f32 	%f3, %f2, %f1;
+; CHECKPTX71-NEXT:  	cvt.rn.bf16.f32 	%rs14, %f3;
+; CHECKPTX71-NEXT:  	atom.cas.b16 	%rs17, [%r1], %rs30, %rs14;
+; CHECKPTX71-NEXT:  	setp.ne.s16 	%p1, %rs17, %rs30;
+; CHECKPTX71-NEXT:  	mov.u16 	%rs30, %rs17;
+; CHECKPTX71-NEXT:  	@%p1 bra 	$L__BB0_1;
+; CHECKPTX71-NEXT:  // %bb.2:                               // %atomicrmw.end
+; CHECKPTX71-NEXT:  	ld.b16 	%rs31, [%r1];
+; CHECKPTX71-NEXT:  $L__BB0_3:                              // %atomicrmw.start2
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r33, %r55, %r2;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs6, %r33;
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f4, %rs6;
-; CHECKPTX71-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs8, %f5;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs8;
-; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
-; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
-; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
-; CHECKPTX71-NEXT:    setp.ne.s32 %p2, %r9, %r55;
-; CHECKPTX71-NEXT:    mov.u32 %r55, %r9;
-; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
-; CHECKPTX71-NEXT:  // %bb.4: // %atomicrmw.end8
-; CHECKPTX71-NEXT:    and.b32 %r10, %r22, -4;
-; CHECKPTX71-NEXT:    shl.b32 %r38, %r22, 3;
-; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
-; CHECKPTX71-NEXT:    shl.b32 %r40, %r26, %r11;
-; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
-; CHECKPTX71-NEXT:    ld.global.u32 %r56, [%r10];
-; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start27
+; CHECKPTX71-NEXT:  	cvt.f32.bf16 	%f4, %rs31;
+; CHECKPTX71-NEXT:  	add.rn.f32 	%f5, %f4, 0f3F800000;
+; CHECKPTX71-NEXT:  	cvt.rn.bf16.f32 	%rs18, %f5;
+; CHECKPTX71-NEXT:  	atom.cas.b16 	%rs21, [%r1], %rs31, %rs18;
+; CHECKPTX71-NEXT:  	setp.ne.s16 	%p2, %rs21, %rs31;
+; CHECKPTX71-NEXT:  	mov.u16 	%rs31, %rs21;
+; CHECKPTX71-NEXT:  	@%p2 bra 	$L__BB0_3;
+; CHECKPTX71-NEXT:  // %bb.4:                               // %atomicrmw.end1
+; CHECKPTX71-NEXT:  	ld.global.b16 	%rs32, [%r2];
+; CHECKPTX71-NEXT:  $L__BB0_5:                              // %atomicrmw.start8
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r41, %r56, %r11;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs10, %r41;
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f6, %rs10;
-; CHECKPTX71-NEXT:    add.rn.f32 %f8, %f6, %f2;
-; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs12, %f8;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs12;
-; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
-; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
-; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
-; CHECKPTX71-NEXT:    setp.ne.s32 %p3, %r15, %r56;
-; CHECKPTX71-NEXT:    mov.u32 %r56, %r15;
-; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
-; CHECKPTX71-NEXT:  // %bb.6: // %atomicrmw.end26
-; CHECKPTX71-NEXT:    and.b32 %r16, %r23, -4;
-; CHECKPTX71-NEXT:    shl.b32 %r46, %r23, 3;
-; CHECKPTX71-NEXT:    and.b32 %r17, %r46, 24;
-; CHECKPTX71-NEXT:    shl.b32 %r48, %r26, %r17;
-; CHECKPTX71-NEXT:    not.b32 %r18, %r48;
-; CHECKPTX71-NEXT:    ld.shared.u32 %r57, [%r16];
-; CHECKPTX71-NEXT:  $L__BB0_7: // %atomicrmw.start45
+; CHECKPTX71-NEXT:  	cvt.f32.bf16 	%f7, %rs32;
+; CHECKPTX71-NEXT:  	add.rn.f32 	%f8, %f7, %f1;
+; CHECKPTX71-NEXT:  	cvt.rn.bf16.f32 	%rs22, %f8;
+; CHECKPTX71-NEXT:  	atom.global.cas.b16 	%rs25, [%r2], %rs32, %rs22;
+; CHECKPTX71-NEXT:  	setp.ne.s16 	%p3, %rs25, %rs32;
+; CHECKPTX71-NEXT:  	mov.u16 	%rs32, %rs25;
+; CHECKPTX71-NEXT:  	@%p3 bra 	$L__BB0_5;
+; CHECKPTX71-NEXT:  // %bb.6:                               // %atomicrmw.end7
+; CHECKPTX71-NEXT:  	ld.shared.b16 	%rs33, [%r3];
+; CHECKPTX71-NEXT:  $L__BB0_7:                              // %atomicrmw.start14
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r49, %r57, %r17;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs14, %r49;
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f9, %rs14;
-; CHECKPTX71-NEXT:    add.rn.f32 %f11, %f9, %f2;
-; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs16, %f11;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r50, %rs16;
-; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
-; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
-; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
-; CHECKPTX71-NEXT:    setp.ne.s32 %p4, %r21, %r57;
-; CHECKPTX71-NEXT:    mov.u32 %r57, %r21;
-; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
-; CHECKPTX71-NEXT:  // %bb.8: // %atomicrmw.end44
-; CHECKPTX71-NEXT:    ret;
+; CHECKPTX71-NEXT:  	cvt.f32.bf16 	%f10, %rs33;
+; CHECKPTX71-NEXT:  	add.rn.f32 	%f11, %f10, %f1;
+; CHECKPTX71-NEXT:  	cvt.rn.bf16.f32 	%rs26, %f11;
+; CHECKPTX71-NEXT:  	atom.shared.cas.b16 	%rs29, [%r3], %rs33, %rs26;
+; CHECKPTX71-NEXT:  	setp.ne.s16 	%p4, %rs29, %rs33;
+; CHECKPTX71-NEXT:  	mov.u16 	%rs33, %rs29;
+; CHECKPTX71-NEXT:  	@%p4 bra 	$L__BB0_7;
+; CHECKPTX71-NEXT:  // %bb.8:                               // %atomicrmw.end13
+; CHECKPTX71-NEXT:  	ret;
   %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
   %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
   %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst

diff  --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
new file mode 100644
index 00000000000000..85ae5f0c8f6013
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march nvptx64 --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s --check-prefixes=SM30,CHECK
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_32 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=SM70,CHECK
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+
+; TODO: these are system scope, but are compiled to gpu scope..
+; TODO: these are seq_cst, but are compiled to relaxed..
+
+; CHECK-LABEL: relaxed_sys_i8
+define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: relaxed_sys_i8(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT:    and.b32 %r10, %r9, 3;
+; SM30-NEXT:    shl.b32 %r1, %r10, 3;
+; SM30-NEXT:    mov.b32 %r11, 255;
+; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM30-NEXT:    not.b32 %r2, %r12;
+; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r17, %r20, %r3;
+; SM30-NEXT:    or.b32 %r18, %r20, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT:    @%p1 bra $L__BB0_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT:    mov.u32 %r20, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB0_1;
+; SM30-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM30-NEXT:    st.param.b32 [func_retval0+0], %r13;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: relaxed_sys_i8(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<17>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs9, [relaxed_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -2;
+; SM70-NEXT:    ld.param.u8 %rs10, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    and.b64 %rd3, %rd2, 1;
+; SM70-NEXT:    shl.b64 %rd4, %rd3, 3;
+; SM70-NEXT:    cvt.u32.u64 %r1, %rd4;
+; SM70-NEXT:    mov.u16 %rs11, 255;
+; SM70-NEXT:    shl.b16 %rs12, %rs11, %r1;
+; SM70-NEXT:    not.b16 %rs2, %rs12;
+; SM70-NEXT:    shl.b16 %rs3, %rs9, %r1;
+; SM70-NEXT:    shl.b16 %rs4, %rs10, %r1;
+; SM70-NEXT:    ld.u16 %rs13, [%rd1];
+; SM70-NEXT:    and.b16 %rs16, %rs13, %rs2;
+; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b16 %rs14, %rs16, %rs3;
+; SM70-NEXT:    or.b16 %rs15, %rs16, %rs4;
+; SM70-NEXT:    atom.cas.b16 %rs7, [%rd1], %rs15, %rs14;
+; SM70-NEXT:    setp.eq.s16 %p1, %rs7, %rs15;
+; SM70-NEXT:    @%p1 bra $L__BB0_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT:    and.b16 %rs8, %rs7, %rs2;
+; SM70-NEXT:    setp.ne.s16 %p2, %rs16, %rs8;
+; SM70-NEXT:    mov.u16 %rs16, %rs8;
+; SM70-NEXT:    @%p2 bra $L__BB0_1;
+; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM70-NEXT:    cvt.u32.u16 %r2, %rs9;
+; SM70-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM70-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+  ret i8 %new
+}
+
+; CHECK-LABEL: relaxed_sys_i16
+define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: relaxed_sys_i16(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 65535;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r16, %r19, %r3;
+; SM30-NEXT:    or.b32 %r17, %r19, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT:    @%p1 bra $L__BB1_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT:    mov.u32 %r19, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB1_1;
+; SM30-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM30-NEXT:    st.param.b32 [func_retval0+0], %r14;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: relaxed_sys_i16(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<4>;
+; SM70-NEXT:    .reg .b32 %r<2>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i16_param_0];
+; SM70-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_1];
+; SM70-NEXT:    ld.param.u16 %rs2, [relaxed_sys_i16_param_2];
+; SM70-NEXT:    atom.cas.b16 %rs3, [%rd1], %rs1, %rs2;
+; SM70-NEXT:    cvt.u32.u16 %r1, %rs2;
+; SM70-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; SM70-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+  ret i16 %new
+}
+
+; CHECK-LABEL: relaxed_sys_i32
+define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: relaxed_sys_i32(
+; SM30:       {
+; SM30-NEXT:    .reg .b32 %r<4>;
+; SM30-NEXT:    .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
+; SM30-NEXT:    ld.param.u32 %r1, [relaxed_sys_i32_param_1];
+; SM30-NEXT:    ld.param.u32 %r2, [relaxed_sys_i32_param_2];
+; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: relaxed_sys_i32(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [relaxed_sys_i32_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [relaxed_sys_i32_param_2];
+; SM70-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM70-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+  ret i32 %new
+}
+
+; CHECK-LABEL: relaxed_sys_i64
+define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: relaxed_sys_i64(
+; SM30:       {
+; SM30-NEXT:    .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
+; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
+; SM30-NEXT:    ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
+; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT:    st.param.b64 [func_retval0+0], %rd3;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: relaxed_sys_i64(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
+; SM70-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0+0], %rd3;
+; SM70-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+  ret i64 %new
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}

diff  --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index eb212ed304e9db..c5e4ad4219c91d 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -636,6 +636,7 @@ def get_triple_from_march(march):
         "amdgcn": "amdgcn",
         "r600": "r600",
         "mips": "mips",
+        "nvptx64": "nvptx64",
         "sparc": "sparc",
         "hexagon": "hexagon",
         "ve": "ve",


        


More information about the llvm-commits mailing list