[llvm] [NVPTX] Stop using 16-bit CAS instructions from PTX (PR #120220)

Fri Jan 3 01:36:19 PST 2025

================
@@ -53,43 +53,44 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-LABEL: relaxed_sys_i8(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<17>;
-; SM70-NEXT:    .reg .b32 %r<3>;
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs9, [relaxed_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -2;
-; SM70-NEXT:    ld.param.u8 %rs10, [relaxed_sys_i8_param_1];
-; SM70-NEXT:    and.b64 %rd3, %rd2, 1;
-; SM70-NEXT:    shl.b64 %rd4, %rd3, 3;
-; SM70-NEXT:    cvt.u32.u64 %r1, %rd4;
-; SM70-NEXT:    mov.u16 %rs11, 255;
-; SM70-NEXT:    shl.b16 %rs12, %rs11, %r1;
-; SM70-NEXT:    not.b16 %rs2, %rs12;
-; SM70-NEXT:    shl.b16 %rs3, %rs9, %r1;
-; SM70-NEXT:    shl.b16 %rs4, %rs10, %r1;
-; SM70-NEXT:    ld.u16 %rs13, [%rd1];
-; SM70-NEXT:    and.b16 %rs16, %rs13, %rs2;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b16 %rs14, %rs16, %rs3;
-; SM70-NEXT:    or.b16 %rs15, %rs16, %rs4;
-; SM70-NEXT:    atom.cas.b16 %rs7, [%rd1], %rs15, %rs14;
-; SM70-NEXT:    setp.eq.s16 %p1, %rs7, %rs15;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB0_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; SM70-NEXT:    and.b16 %rs8, %rs7, %rs2;
-; SM70-NEXT:    setp.ne.s16 %p2, %rs16, %rs8;
-; SM70-NEXT:    mov.u16 %rs16, %rs8;
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB0_1;
 ; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT:    cvt.u32.u16 %r2, %rs9;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
----------------
akshayrdeodhar wrote:

The original test was incorrect: the generated code was *not* sequentially consistent. AtomicExpand (as of now) does not take memory ordering into account and generates code for monotonic/relaxed semantics. This change makes the IR consistent with the codegen.

https://github.com/llvm/llvm-project/pull/120220