[llvm] [NVPTX] Stop using 16-bit CAS instructions from PTX (PR #120220)
Akshay Deodhar via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 3 05:43:13 PST 2025
https://github.com/akshayrdeodhar updated https://github.com/llvm/llvm-project/pull/120220
>From 62118787378345d899cc0e01b1f4d297590f1b7e Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Mon, 25 Nov 2024 13:29:51 +0000
Subject: [PATCH 1/2] [NVPTX] Stop using 16-bit CAS instructions from PTX
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 2 +
llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 8 +-
llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 128 +++++++++++++-------
llvm/test/CodeGen/NVPTX/cmpxchg.ll | 102 ++++++++++------
5 files changed, 155 insertions(+), 87 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b9003ddbd3187c..7650bbfdba4701 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -907,7 +907,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// actions
computeRegisterProperties(STI.getRegisterInfo());
- setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
+ setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
setMaxAtomicSizeInBitsSupported(64);
setMaxDivRemBitWidthSupported(64);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e785bbf830da62..a5dbf3f7438956 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -123,6 +123,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// set of equivalent memory operations with a scalar data-type, executed in
// an unspecified order on the elements in the vector.
unsigned getMaxRequiredAlignment() const { return 8; }
+ // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS
+ unsigned getMinCmpXchgSizeInBits() const { return 32; }
unsigned getPTXVersion() const { return PTXVersion; }
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
index 0c1ca8cb7ac166..a5b81dfc0cd009 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -132,10 +132,10 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX62-NEXT: ret;
- %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
- %r2 = atomicrmw fadd ptr %dp0, half 1.0 seq_cst
- %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst
- %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst
+ %r1 = atomicrmw fadd ptr %dp0, half %val monotonic
+ %r2 = atomicrmw fadd ptr %dp0, half 1.0 monotonic
+ %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val monotonic
+ %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val monotonic
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 8bae18dcc5eef8..16e7baced67838 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -46,65 +46,105 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-LABEL: test(
; CHECKPTX71: {
; CHECKPTX71-NEXT: .reg .pred %p<5>;
-; CHECKPTX71-NEXT: .reg .b16 %rs<34>;
-; CHECKPTX71-NEXT: .reg .b32 %r<4>;
+; CHECKPTX71-NEXT: .reg .b16 %rs<18>;
+; CHECKPTX71-NEXT: .reg .b32 %r<58>;
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
; CHECKPTX71-EMPTY:
; CHECKPTX71-NEXT: // %bb.0:
-; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
-; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
-; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
-; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
-; CHECKPTX71-NEXT: ld.b16 %rs30, [%r1];
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13;
-; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14
+; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
+; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
+; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
+; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
+; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
+; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
+; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
+; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
+; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
+; CHECKPTX71-NEXT: not.b32 %r3, %r27;
+; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs1;
+; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs30;
-; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3;
-; CHECKPTX71-NEXT: atom.cas.b16 %rs17, [%r1], %rs30, %rs14;
-; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs17, %rs30;
-; CHECKPTX71-NEXT: mov.u16 %rs30, %rs17;
+; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs2;
+; CHECKPTX71-NEXT: add.rn.f32 %f3, %f1, %f2;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs4, %f3;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
+; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
+; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
+; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
+; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
+; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
-; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13
-; CHECKPTX71-NEXT: ld.b16 %rs31, [%r1];
-; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8
+; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
+; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
+; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs31;
+; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r33;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs6;
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs18, %f5;
-; CHECKPTX71-NEXT: atom.cas.b16 %rs21, [%r1], %rs31, %rs18;
-; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs21, %rs31;
-; CHECKPTX71-NEXT: mov.u16 %rs31, %rs21;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs8;
+; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
+; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
+; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
+; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
+; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
-; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7
-; CHECKPTX71-NEXT: ld.global.b16 %rs32, [%r2];
-; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2
+; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
+; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
+; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
+; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
+; CHECKPTX71-NEXT: shl.b32 %r40, %r26, %r11;
+; CHECKPTX71-NEXT: not.b32 %r12, %r40;
+; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
+; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs32;
-; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs22, %f8;
-; CHECKPTX71-NEXT: atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22;
-; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs25, %rs32;
-; CHECKPTX71-NEXT: mov.u16 %rs32, %rs25;
+; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs10, %r41;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs10;
+; CHECKPTX71-NEXT: add.rn.f32 %f8, %f6, %f2;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs12, %f8;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs12;
+; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
+; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
+; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
+; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
+; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
-; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1
-; CHECKPTX71-NEXT: ld.shared.b16 %rs33, [%r3];
+; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
+; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
+; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
+; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
+; CHECKPTX71-NEXT: shl.b32 %r48, %r26, %r17;
+; CHECKPTX71-NEXT: not.b32 %r18, %r48;
+; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs33;
-; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs26, %f11;
-; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26;
-; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs29, %rs33;
-; CHECKPTX71-NEXT: mov.u16 %rs33, %rs29;
+; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs14, %r49;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs14;
+; CHECKPTX71-NEXT: add.rn.f32 %f11, %f9, %f2;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f11;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs16;
+; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
+; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
+; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
+; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
+; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX71-NEXT: ret;
- %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
- %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
- %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst
- %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val seq_cst
+ %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
+ %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
+ %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
+ %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index f7cc32b962b9c8..dd4bd078ee8ccf 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -53,43 +53,44 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-LABEL: relaxed_sys_i8(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<17>;
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs9, [relaxed_sys_i8_param_2];
+; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
-; SM70-NEXT: and.b64 %rd1, %rd2, -2;
-; SM70-NEXT: ld.param.u8 %rs10, [relaxed_sys_i8_param_1];
-; SM70-NEXT: and.b64 %rd3, %rd2, 1;
-; SM70-NEXT: shl.b64 %rd4, %rd3, 3;
-; SM70-NEXT: cvt.u32.u64 %r1, %rd4;
-; SM70-NEXT: mov.u16 %rs11, 255;
-; SM70-NEXT: shl.b16 %rs12, %rs11, %r1;
-; SM70-NEXT: not.b16 %rs2, %rs12;
-; SM70-NEXT: shl.b16 %rs3, %rs9, %r1;
-; SM70-NEXT: shl.b16 %rs4, %rs10, %r1;
-; SM70-NEXT: ld.u16 %rs13, [%rd1];
-; SM70-NEXT: and.b16 %rs16, %rs13, %rs2;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b16 %rs14, %rs16, %rs3;
-; SM70-NEXT: or.b16 %rs15, %rs16, %rs4;
-; SM70-NEXT: atom.cas.b16 %rs7, [%rd1], %rs15, %rs14;
-; SM70-NEXT: setp.eq.s16 %p1, %rs7, %rs15;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB0_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1
-; SM70-NEXT: and.b16 %rs8, %rs7, %rs2;
-; SM70-NEXT: setp.ne.s16 %p2, %rs16, %rs8;
-; SM70-NEXT: mov.u16 %rs16, %rs8;
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB0_1;
; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT: cvt.u32.u16 %r2, %rs9;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
ret i8 %new
}
@@ -137,19 +138,44 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
;
; SM70-LABEL: relaxed_sys_i16(
; SM70: {
-; SM70-NEXT: .reg .b16 %rs<4>;
-; SM70-NEXT: .reg .b32 %r<2>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i16_param_0];
-; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_1];
-; SM70-NEXT: ld.param.u16 %rs2, [relaxed_sys_i16_param_2];
-; SM70-NEXT: atom.cas.b16 %rs3, [%rd1], %rs1, %rs2;
-; SM70-NEXT: cvt.u32.u16 %r1, %rs2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB1_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB1_1;
+; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
ret i16 %new
}
@@ -180,7 +206,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
ret i32 %new
}
@@ -209,7 +235,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
ret i64 %new
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
>From 17fec130f0f62e035cca8551f60ed2b0e8087035 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 3 Jan 2025 13:42:52 +0000
Subject: [PATCH 2/2] update tests
---
llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 36 +++++++++++++------------
1 file changed, 19 insertions(+), 17 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 7bc5d96dd500e8..5b999a9c9ecde8 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -46,7 +46,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-LABEL: test(
; CHECKPTX71: {
; CHECKPTX71-NEXT: .reg .pred %p<5>;
-; CHECKPTX71-NEXT: .reg .b16 %rs<18>;
+; CHECKPTX71-NEXT: .reg .b16 %rs<10>;
; CHECKPTX71-NEXT: .reg .b32 %r<58>;
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
; CHECKPTX71-EMPTY:
@@ -69,8 +69,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs2;
; CHECKPTX71-NEXT: add.rn.f32 %f3, %f1, %f2;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs4, %f3;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs3;
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
@@ -83,11 +83,11 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r33;
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs6;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs4, %r33;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs4;
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs8;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs5, %f5;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs5;
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
@@ -99,17 +99,18 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
-; CHECKPTX71-NEXT: shl.b32 %r40, %r26, %r11;
+; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
+; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs10, %r41;
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs10;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r41;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs6;
; CHECKPTX71-NEXT: add.rn.f32 %f8, %f6, %f2;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs12, %f8;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs12;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs7, %f8;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs7;
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
@@ -121,17 +122,18 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
-; CHECKPTX71-NEXT: shl.b32 %r48, %r26, %r17;
+; CHECKPTX71-NEXT: mov.b32 %r47, 65535;
+; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17;
; CHECKPTX71-NEXT: not.b32 %r18, %r48;
; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs14, %r49;
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs14;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r49;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs8;
; CHECKPTX71-NEXT: add.rn.f32 %f11, %f9, %f2;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f11;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs16;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs9, %f11;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs9;
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
More information about the llvm-commits mailing list