[llvm] [NVPTX] simplified atomics-sm90.ll test (PR #125968)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 5 16:09:13 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-nvptx
Author: Artem Belevich (Artem-B)
<details>
<summary>Changes</summary>
There's no point running 32-bit ptxas tests for new GPUs. New CUDA versions no longer support 32-bit compilation.
Also set the correct version constraints on ptxas so that the test does not fail with older CUDA versions.
---
Full diff: https://github.com/llvm/llvm-project/pull/125968.diff
1 File Affected:
- (modified) llvm/test/CodeGen/NVPTX/atomics-sm90.ll (+118-133)
``````````diff
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 67abfe8295a623..139da9befc4b78 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -1,142 +1,127 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefix=PTX78
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefix=PTX71
+; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-11.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %}
target triple = "nvptx64-nvidia-cuda"
define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat %val) {
-; CHECK-LABEL: test(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<7>;
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u32 %r1, [test_param_0];
-; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECK-NEXT: atom.add.noftz.bf16 %rs2, [%r1], %rs1;
-; CHECK-NEXT: ld.param.u32 %r2, [test_param_1];
-; CHECK-NEXT: mov.b16 %rs3, 0x3F80;
-; CHECK-NEXT: atom.add.noftz.bf16 %rs4, [%r1], %rs3;
-; CHECK-NEXT: ld.param.u32 %r3, [test_param_2];
-; CHECK-NEXT: atom.global.add.noftz.bf16 %rs5, [%r2], %rs1;
-; CHECK-NEXT: atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1;
-; CHECK-NEXT: ret;
+; PTX78-LABEL: test(
+; PTX78: {
+; PTX78-NEXT: .reg .b16 %rs<7>;
+; PTX78-NEXT: .reg .b64 %rd<4>;
+; PTX78-EMPTY:
+; PTX78-NEXT: // %bb.0:
+; PTX78-NEXT: ld.param.u64 %rd1, [test_param_0];
+; PTX78-NEXT: ld.param.b16 %rs1, [test_param_3];
+; PTX78-NEXT: atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
+; PTX78-NEXT: ld.param.u64 %rd2, [test_param_1];
+; PTX78-NEXT: mov.b16 %rs3, 0x3F80;
+; PTX78-NEXT: atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
+; PTX78-NEXT: ld.param.u64 %rd3, [test_param_2];
+; PTX78-NEXT: atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
+; PTX78-NEXT: atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
+; PTX78-NEXT: ret;
;
-; CHECK64-LABEL: test(
-; CHECK64: {
-; CHECK64-NEXT: .reg .b16 %rs<7>;
-; CHECK64-NEXT: .reg .b64 %rd<4>;
-; CHECK64-EMPTY:
-; CHECK64-NEXT: // %bb.0:
-; CHECK64-NEXT: ld.param.u64 %rd1, [test_param_0];
-; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECK64-NEXT: atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
-; CHECK64-NEXT: ld.param.u64 %rd2, [test_param_1];
-; CHECK64-NEXT: mov.b16 %rs3, 0x3F80;
-; CHECK64-NEXT: atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
-; CHECK64-NEXT: ld.param.u64 %rd3, [test_param_2];
-; CHECK64-NEXT: atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
-; CHECK64-NEXT: atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
-; CHECK64-NEXT: ret;
-;
-; CHECKPTX71-LABEL: test(
-; CHECKPTX71: {
-; CHECKPTX71-NEXT: .reg .pred %p<5>;
-; CHECKPTX71-NEXT: .reg .b16 %rs<14>;
-; CHECKPTX71-NEXT: .reg .b32 %r<58>;
-; CHECKPTX71-EMPTY:
-; CHECKPTX71-NEXT: // %bb.0:
-; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
-; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
-; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
-; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
-; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
-; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
-; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
-; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
-; CHECKPTX71-NEXT: not.b32 %r3, %r27;
-; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
-; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
-; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
-; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80;
-; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
-; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
-; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
-; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
-; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
-; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
-; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
-; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
-; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
-; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
-; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r33;
-; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80;
-; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs7;
-; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
-; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
-; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
-; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
-; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
-; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
-; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
-; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
-; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
-; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
-; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
-; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
-; CHECKPTX71-NEXT: not.b32 %r12, %r40;
-; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
-; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
-; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r41;
-; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80;
-; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs10;
-; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
-; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
-; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
-; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
-; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
-; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
-; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
-; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
-; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
-; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
-; CHECKPTX71-NEXT: mov.b32 %r47, 65535;
-; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17;
-; CHECKPTX71-NEXT: not.b32 %r18, %r48;
-; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
-; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
-; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r49;
-; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80;
-; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs13;
-; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
-; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
-; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
-; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
-; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
-; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
-; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
-; CHECKPTX71-NEXT: ret;
+; PTX71-LABEL: test(
+; PTX71: {
+; PTX71-NEXT: .reg .pred %p<5>;
+; PTX71-NEXT: .reg .b16 %rs<14>;
+; PTX71-NEXT: .reg .b32 %r<55>;
+; PTX71-NEXT: .reg .b64 %rd<7>;
+; PTX71-EMPTY:
+; PTX71-NEXT: // %bb.0:
+; PTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
+; PTX71-NEXT: ld.param.u64 %rd5, [test_param_2];
+; PTX71-NEXT: ld.param.u64 %rd4, [test_param_1];
+; PTX71-NEXT: ld.param.u64 %rd6, [test_param_0];
+; PTX71-NEXT: and.b64 %rd1, %rd6, -4;
+; PTX71-NEXT: cvt.u32.u64 %r19, %rd6;
+; PTX71-NEXT: and.b32 %r20, %r19, 3;
+; PTX71-NEXT: shl.b32 %r1, %r20, 3;
+; PTX71-NEXT: mov.b32 %r21, 65535;
+; PTX71-NEXT: shl.b32 %r22, %r21, %r1;
+; PTX71-NEXT: not.b32 %r2, %r22;
+; PTX71-NEXT: ld.u32 %r51, [%rd1];
+; PTX71-NEXT: $L__BB0_1: // %atomicrmw.start48
+; PTX71-NEXT: // =>This Inner Loop Header: Depth=1
+; PTX71-NEXT: shr.u32 %r23, %r51, %r1;
+; PTX71-NEXT: cvt.u16.u32 %rs2, %r23;
+; PTX71-NEXT: mov.b16 %rs3, 0x3F80;
+; PTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
+; PTX71-NEXT: cvt.u32.u16 %r24, %rs4;
+; PTX71-NEXT: shl.b32 %r25, %r24, %r1;
+; PTX71-NEXT: and.b32 %r26, %r51, %r2;
+; PTX71-NEXT: or.b32 %r27, %r26, %r25;
+; PTX71-NEXT: atom.cas.b32 %r5, [%rd1], %r51, %r27;
+; PTX71-NEXT: setp.ne.s32 %p1, %r5, %r51;
+; PTX71-NEXT: mov.u32 %r51, %r5;
+; PTX71-NEXT: @%p1 bra $L__BB0_1;
+; PTX71-NEXT: // %bb.2: // %atomicrmw.end47
+; PTX71-NEXT: ld.u32 %r52, [%rd1];
+; PTX71-NEXT: $L__BB0_3: // %atomicrmw.start29
+; PTX71-NEXT: // =>This Inner Loop Header: Depth=1
+; PTX71-NEXT: shr.u32 %r28, %r52, %r1;
+; PTX71-NEXT: cvt.u16.u32 %rs5, %r28;
+; PTX71-NEXT: mov.b16 %rs6, 0x3F80;
+; PTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
+; PTX71-NEXT: cvt.u32.u16 %r29, %rs7;
+; PTX71-NEXT: shl.b32 %r30, %r29, %r1;
+; PTX71-NEXT: and.b32 %r31, %r52, %r2;
+; PTX71-NEXT: or.b32 %r32, %r31, %r30;
+; PTX71-NEXT: atom.cas.b32 %r8, [%rd1], %r52, %r32;
+; PTX71-NEXT: setp.ne.s32 %p2, %r8, %r52;
+; PTX71-NEXT: mov.u32 %r52, %r8;
+; PTX71-NEXT: @%p2 bra $L__BB0_3;
+; PTX71-NEXT: // %bb.4: // %atomicrmw.end28
+; PTX71-NEXT: and.b64 %rd2, %rd4, -4;
+; PTX71-NEXT: cvt.u32.u64 %r33, %rd4;
+; PTX71-NEXT: and.b32 %r34, %r33, 3;
+; PTX71-NEXT: shl.b32 %r9, %r34, 3;
+; PTX71-NEXT: mov.b32 %r35, 65535;
+; PTX71-NEXT: shl.b32 %r36, %r35, %r9;
+; PTX71-NEXT: not.b32 %r10, %r36;
+; PTX71-NEXT: ld.global.u32 %r53, [%rd2];
+; PTX71-NEXT: $L__BB0_5: // %atomicrmw.start10
+; PTX71-NEXT: // =>This Inner Loop Header: Depth=1
+; PTX71-NEXT: shr.u32 %r37, %r53, %r9;
+; PTX71-NEXT: cvt.u16.u32 %rs8, %r37;
+; PTX71-NEXT: mov.b16 %rs9, 0x3F80;
+; PTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
+; PTX71-NEXT: cvt.u32.u16 %r38, %rs10;
+; PTX71-NEXT: shl.b32 %r39, %r38, %r9;
+; PTX71-NEXT: and.b32 %r40, %r53, %r10;
+; PTX71-NEXT: or.b32 %r41, %r40, %r39;
+; PTX71-NEXT: atom.global.cas.b32 %r13, [%rd2], %r53, %r41;
+; PTX71-NEXT: setp.ne.s32 %p3, %r13, %r53;
+; PTX71-NEXT: mov.u32 %r53, %r13;
+; PTX71-NEXT: @%p3 bra $L__BB0_5;
+; PTX71-NEXT: // %bb.6: // %atomicrmw.end9
+; PTX71-NEXT: and.b64 %rd3, %rd5, -4;
+; PTX71-NEXT: cvt.u32.u64 %r42, %rd5;
+; PTX71-NEXT: and.b32 %r43, %r42, 3;
+; PTX71-NEXT: shl.b32 %r14, %r43, 3;
+; PTX71-NEXT: mov.b32 %r44, 65535;
+; PTX71-NEXT: shl.b32 %r45, %r44, %r14;
+; PTX71-NEXT: not.b32 %r15, %r45;
+; PTX71-NEXT: ld.shared.u32 %r54, [%rd3];
+; PTX71-NEXT: $L__BB0_7: // %atomicrmw.start
+; PTX71-NEXT: // =>This Inner Loop Header: Depth=1
+; PTX71-NEXT: shr.u32 %r46, %r54, %r14;
+; PTX71-NEXT: cvt.u16.u32 %rs11, %r46;
+; PTX71-NEXT: mov.b16 %rs12, 0x3F80;
+; PTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
+; PTX71-NEXT: cvt.u32.u16 %r47, %rs13;
+; PTX71-NEXT: shl.b32 %r48, %r47, %r14;
+; PTX71-NEXT: and.b32 %r49, %r54, %r15;
+; PTX71-NEXT: or.b32 %r50, %r49, %r48;
+; PTX71-NEXT: atom.shared.cas.b32 %r18, [%rd3], %r54, %r50;
+; PTX71-NEXT: setp.ne.s32 %p4, %r18, %r54;
+; PTX71-NEXT: mov.u32 %r54, %r18;
+; PTX71-NEXT: @%p4 bra $L__BB0_7;
+; PTX71-NEXT: // %bb.8: // %atomicrmw.end
+; PTX71-NEXT: ret;
%r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
%r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
%r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
``````````
</details>
https://github.com/llvm/llvm-project/pull/125968
More information about the llvm-commits mailing list