[clang] [llvm] [NVPTX] Support __usAtomicCAS builtin (PR #99646)

Tue Aug 20 10:44:29 PDT 2024

================
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march nvptx64 --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_32 | FileCheck %s --check-prefixes=SM30,CHECK
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_32 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=SM70,CHECK
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+
+; TODO: these operations are system scope, but are currently compiled to gpu scope.
+; TODO: these operations are seq_cst, but are currently compiled to relaxed.
+
+; CHECK-LABEL: relaxed_sys_i8
+define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: relaxed_sys_i8(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT:    and.b32 %r10, %r9, 3;
+; SM30-NEXT:    shl.b32 %r1, %r10, 3;
+; SM30-NEXT:    mov.b32 %r11, 255;
+; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM30-NEXT:    not.b32 %r2, %r12;
+; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r17, %r20, %r3;
+; SM30-NEXT:    or.b32 %r18, %r20, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT:    @%p1 bra $L__BB0_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT:    mov.u32 %r20, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB0_1;
+; SM30-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM30-NEXT:    st.param.b32 [func_retval0+0], %r13;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: relaxed_sys_i8(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<17>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs9, [relaxed_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -2;
+; SM70-NEXT:    ld.param.u8 %rs10, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    and.b64 %rd3, %rd2, 1;
+; SM70-NEXT:    shl.b64 %rd4, %rd3, 3;
+; SM70-NEXT:    cvt.u32.u64 %r1, %rd4;
+; SM70-NEXT:    mov.u16 %rs11, 255;
+; SM70-NEXT:    shl.b16 %rs12, %rs11, %r1;
+; SM70-NEXT:    not.b16 %rs2, %rs12;
+; SM70-NEXT:    shl.b16 %rs3, %rs9, %r1;
+; SM70-NEXT:    shl.b16 %rs4, %rs10, %r1;
+; SM70-NEXT:    ld.u16 %rs13, [%rd1];
+; SM70-NEXT:    and.b16 %rs16, %rs13, %rs2;
+; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b16 %rs14, %rs16, %rs3;
+; SM70-NEXT:    or.b16 %rs15, %rs16, %rs4;
+; SM70-NEXT:    atom.cas.b16 %rs7, [%rd1], %rs15, %rs14;
+; SM70-NEXT:    setp.eq.s16 %p1, %rs7, %rs15;
+; SM70-NEXT:    @%p1 bra $L__BB0_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT:    and.b16 %rs8, %rs7, %rs2;
+; SM70-NEXT:    setp.ne.s16 %p2, %rs16, %rs8;
+; SM70-NEXT:    mov.u16 %rs16, %rs8;
+; SM70-NEXT:    @%p2 bra $L__BB0_1;
+; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM70-NEXT:    cvt.u32.u16 %r2, %rs9;
+; SM70-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM70-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
----------------
Artem-B wrote:

> this is generating tests that I think are too precise and contain too much unrelated CHECKs,

They look reasonably readable to me. The cases where we have to fall back to a more complicated algorithm are expected to be more verbose.

Granted, not all the parts are particularly interesting, and they would be an unnecessary burden if the checks were written manually, but with autogenerated checks that's not an issue. On the positive side, being able to see that some IR produces less efficient code than other IR is sometimes quite useful, as it highlights current inefficiencies and possible future optimization opportunities.


https://github.com/llvm/llvm-project/pull/99646