[llvm] [NVPTX] simplified atomics-sm90.ll test (PR #125968)

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 5 16:08:37 PST 2025


https://github.com/Artem-B created https://github.com/llvm/llvm-project/pull/125968

There's no point in running 32-bit ptxas tests for new GPUs, because new CUDA versions no longer support 32-bit compilation.

Also set the correct version constraints on the ptxas RUN lines so the test does not fail with older CUDA versions.



>From 192fe92242032221798eeb218557124c7b904840 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Wed, 5 Feb 2025 15:39:39 -0800
Subject: [PATCH] [NVPTX] simplified atomics-sm90.ll test

..and fixed ptxas test runs that didn't have correct constraints on ptxas version.
---
 llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 251 +++++++++++-------------
 1 file changed, 118 insertions(+), 133 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 67abfe8295a6231..139da9befc4b78b 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -1,142 +1,127 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefix=PTX78
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefix=PTX71
+; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-11.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %}
 
 target triple = "nvptx64-nvidia-cuda"
 
 define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat %val) {
-; CHECK-LABEL: test(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECK-NEXT:    atom.add.noftz.bf16 %rs2, [%r1], %rs1;
-; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
-; CHECK-NEXT:    mov.b16 %rs3, 0x3F80;
-; CHECK-NEXT:    atom.add.noftz.bf16 %rs4, [%r1], %rs3;
-; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
-; CHECK-NEXT:    atom.global.add.noftz.bf16 %rs5, [%r2], %rs1;
-; CHECK-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1;
-; CHECK-NEXT:    ret;
+; PTX78-LABEL: test(
+; PTX78:       {
+; PTX78-NEXT:    .reg .b16 %rs<7>;
+; PTX78-NEXT:    .reg .b64 %rd<4>;
+; PTX78-EMPTY:
+; PTX78-NEXT:  // %bb.0:
+; PTX78-NEXT:    ld.param.u64 %rd1, [test_param_0];
+; PTX78-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; PTX78-NEXT:    atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
+; PTX78-NEXT:    ld.param.u64 %rd2, [test_param_1];
+; PTX78-NEXT:    mov.b16 %rs3, 0x3F80;
+; PTX78-NEXT:    atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
+; PTX78-NEXT:    ld.param.u64 %rd3, [test_param_2];
+; PTX78-NEXT:    atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
+; PTX78-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
+; PTX78-NEXT:    ret;
 ;
-; CHECK64-LABEL: test(
-; CHECK64:       {
-; CHECK64-NEXT:    .reg .b16 %rs<7>;
-; CHECK64-NEXT:    .reg .b64 %rd<4>;
-; CHECK64-EMPTY:
-; CHECK64-NEXT:  // %bb.0:
-; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
-; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECK64-NEXT:    atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
-; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
-; CHECK64-NEXT:    mov.b16 %rs3, 0x3F80;
-; CHECK64-NEXT:    atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
-; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
-; CHECK64-NEXT:    atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
-; CHECK64-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
-; CHECK64-NEXT:    ret;
-;
-; CHECKPTX71-LABEL: test(
-; CHECKPTX71:       {
-; CHECKPTX71-NEXT:    .reg .pred %p<5>;
-; CHECKPTX71-NEXT:    .reg .b16 %rs<14>;
-; CHECKPTX71-NEXT:    .reg .b32 %r<58>;
-; CHECKPTX71-EMPTY:
-; CHECKPTX71-NEXT:  // %bb.0:
-; CHECKPTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX71-NEXT:    ld.param.u32 %r23, [test_param_2];
-; CHECKPTX71-NEXT:    ld.param.u32 %r22, [test_param_1];
-; CHECKPTX71-NEXT:    ld.param.u32 %r24, [test_param_0];
-; CHECKPTX71-NEXT:    and.b32 %r1, %r24, -4;
-; CHECKPTX71-NEXT:    and.b32 %r25, %r24, 3;
-; CHECKPTX71-NEXT:    shl.b32 %r2, %r25, 3;
-; CHECKPTX71-NEXT:    mov.b32 %r26, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
-; CHECKPTX71-NEXT:    not.b32 %r3, %r27;
-; CHECKPTX71-NEXT:    ld.u32 %r54, [%r1];
-; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start45
-; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r28, %r54, %r2;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r28;
-; CHECKPTX71-NEXT:    mov.b16 %rs3, 0x3F80;
-; CHECKPTX71-NEXT:    fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r29, %rs4;
-; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
-; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
-; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
-; CHECKPTX71-NEXT:    setp.ne.s32 %p1, %r6, %r54;
-; CHECKPTX71-NEXT:    mov.u32 %r54, %r6;
-; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
-; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end44
-; CHECKPTX71-NEXT:    ld.u32 %r55, [%r1];
-; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start27
-; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r33, %r55, %r2;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs5, %r33;
-; CHECKPTX71-NEXT:    mov.b16 %rs6, 0x3F80;
-; CHECKPTX71-NEXT:    fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs7;
-; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
-; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
-; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
-; CHECKPTX71-NEXT:    setp.ne.s32 %p2, %r9, %r55;
-; CHECKPTX71-NEXT:    mov.u32 %r55, %r9;
-; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
-; CHECKPTX71-NEXT:  // %bb.4: // %atomicrmw.end26
-; CHECKPTX71-NEXT:    and.b32 %r10, %r22, -4;
-; CHECKPTX71-NEXT:    shl.b32 %r38, %r22, 3;
-; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
-; CHECKPTX71-NEXT:    mov.b32 %r39, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r40, %r39, %r11;
-; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
-; CHECKPTX71-NEXT:    ld.global.u32 %r56, [%r10];
-; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start9
-; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r41, %r56, %r11;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs8, %r41;
-; CHECKPTX71-NEXT:    mov.b16 %rs9, 0x3F80;
-; CHECKPTX71-NEXT:    fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs10;
-; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
-; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
-; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
-; CHECKPTX71-NEXT:    setp.ne.s32 %p3, %r15, %r56;
-; CHECKPTX71-NEXT:    mov.u32 %r56, %r15;
-; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
-; CHECKPTX71-NEXT:  // %bb.6: // %atomicrmw.end8
-; CHECKPTX71-NEXT:    and.b32 %r16, %r23, -4;
-; CHECKPTX71-NEXT:    shl.b32 %r46, %r23, 3;
-; CHECKPTX71-NEXT:    and.b32 %r17, %r46, 24;
-; CHECKPTX71-NEXT:    mov.b32 %r47, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r48, %r47, %r17;
-; CHECKPTX71-NEXT:    not.b32 %r18, %r48;
-; CHECKPTX71-NEXT:    ld.shared.u32 %r57, [%r16];
-; CHECKPTX71-NEXT:  $L__BB0_7: // %atomicrmw.start
-; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r49, %r57, %r17;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs11, %r49;
-; CHECKPTX71-NEXT:    mov.b16 %rs12, 0x3F80;
-; CHECKPTX71-NEXT:    fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r50, %rs13;
-; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
-; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
-; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
-; CHECKPTX71-NEXT:    setp.ne.s32 %p4, %r21, %r57;
-; CHECKPTX71-NEXT:    mov.u32 %r57, %r21;
-; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
-; CHECKPTX71-NEXT:  // %bb.8: // %atomicrmw.end
-; CHECKPTX71-NEXT:    ret;
+; PTX71-LABEL: test(
+; PTX71:       {
+; PTX71-NEXT:    .reg .pred %p<5>;
+; PTX71-NEXT:    .reg .b16 %rs<14>;
+; PTX71-NEXT:    .reg .b32 %r<55>;
+; PTX71-NEXT:    .reg .b64 %rd<7>;
+; PTX71-EMPTY:
+; PTX71-NEXT:  // %bb.0:
+; PTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; PTX71-NEXT:    ld.param.u64 %rd5, [test_param_2];
+; PTX71-NEXT:    ld.param.u64 %rd4, [test_param_1];
+; PTX71-NEXT:    ld.param.u64 %rd6, [test_param_0];
+; PTX71-NEXT:    and.b64 %rd1, %rd6, -4;
+; PTX71-NEXT:    cvt.u32.u64 %r19, %rd6;
+; PTX71-NEXT:    and.b32 %r20, %r19, 3;
+; PTX71-NEXT:    shl.b32 %r1, %r20, 3;
+; PTX71-NEXT:    mov.b32 %r21, 65535;
+; PTX71-NEXT:    shl.b32 %r22, %r21, %r1;
+; PTX71-NEXT:    not.b32 %r2, %r22;
+; PTX71-NEXT:    ld.u32 %r51, [%rd1];
+; PTX71-NEXT:  $L__BB0_1: // %atomicrmw.start48
+; PTX71-NEXT:    // =>This Inner Loop Header: Depth=1
+; PTX71-NEXT:    shr.u32 %r23, %r51, %r1;
+; PTX71-NEXT:    cvt.u16.u32 %rs2, %r23;
+; PTX71-NEXT:    mov.b16 %rs3, 0x3F80;
+; PTX71-NEXT:    fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
+; PTX71-NEXT:    cvt.u32.u16 %r24, %rs4;
+; PTX71-NEXT:    shl.b32 %r25, %r24, %r1;
+; PTX71-NEXT:    and.b32 %r26, %r51, %r2;
+; PTX71-NEXT:    or.b32 %r27, %r26, %r25;
+; PTX71-NEXT:    atom.cas.b32 %r5, [%rd1], %r51, %r27;
+; PTX71-NEXT:    setp.ne.s32 %p1, %r5, %r51;
+; PTX71-NEXT:    mov.u32 %r51, %r5;
+; PTX71-NEXT:    @%p1 bra $L__BB0_1;
+; PTX71-NEXT:  // %bb.2: // %atomicrmw.end47
+; PTX71-NEXT:    ld.u32 %r52, [%rd1];
+; PTX71-NEXT:  $L__BB0_3: // %atomicrmw.start29
+; PTX71-NEXT:    // =>This Inner Loop Header: Depth=1
+; PTX71-NEXT:    shr.u32 %r28, %r52, %r1;
+; PTX71-NEXT:    cvt.u16.u32 %rs5, %r28;
+; PTX71-NEXT:    mov.b16 %rs6, 0x3F80;
+; PTX71-NEXT:    fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
+; PTX71-NEXT:    cvt.u32.u16 %r29, %rs7;
+; PTX71-NEXT:    shl.b32 %r30, %r29, %r1;
+; PTX71-NEXT:    and.b32 %r31, %r52, %r2;
+; PTX71-NEXT:    or.b32 %r32, %r31, %r30;
+; PTX71-NEXT:    atom.cas.b32 %r8, [%rd1], %r52, %r32;
+; PTX71-NEXT:    setp.ne.s32 %p2, %r8, %r52;
+; PTX71-NEXT:    mov.u32 %r52, %r8;
+; PTX71-NEXT:    @%p2 bra $L__BB0_3;
+; PTX71-NEXT:  // %bb.4: // %atomicrmw.end28
+; PTX71-NEXT:    and.b64 %rd2, %rd4, -4;
+; PTX71-NEXT:    cvt.u32.u64 %r33, %rd4;
+; PTX71-NEXT:    and.b32 %r34, %r33, 3;
+; PTX71-NEXT:    shl.b32 %r9, %r34, 3;
+; PTX71-NEXT:    mov.b32 %r35, 65535;
+; PTX71-NEXT:    shl.b32 %r36, %r35, %r9;
+; PTX71-NEXT:    not.b32 %r10, %r36;
+; PTX71-NEXT:    ld.global.u32 %r53, [%rd2];
+; PTX71-NEXT:  $L__BB0_5: // %atomicrmw.start10
+; PTX71-NEXT:    // =>This Inner Loop Header: Depth=1
+; PTX71-NEXT:    shr.u32 %r37, %r53, %r9;
+; PTX71-NEXT:    cvt.u16.u32 %rs8, %r37;
+; PTX71-NEXT:    mov.b16 %rs9, 0x3F80;
+; PTX71-NEXT:    fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
+; PTX71-NEXT:    cvt.u32.u16 %r38, %rs10;
+; PTX71-NEXT:    shl.b32 %r39, %r38, %r9;
+; PTX71-NEXT:    and.b32 %r40, %r53, %r10;
+; PTX71-NEXT:    or.b32 %r41, %r40, %r39;
+; PTX71-NEXT:    atom.global.cas.b32 %r13, [%rd2], %r53, %r41;
+; PTX71-NEXT:    setp.ne.s32 %p3, %r13, %r53;
+; PTX71-NEXT:    mov.u32 %r53, %r13;
+; PTX71-NEXT:    @%p3 bra $L__BB0_5;
+; PTX71-NEXT:  // %bb.6: // %atomicrmw.end9
+; PTX71-NEXT:    and.b64 %rd3, %rd5, -4;
+; PTX71-NEXT:    cvt.u32.u64 %r42, %rd5;
+; PTX71-NEXT:    and.b32 %r43, %r42, 3;
+; PTX71-NEXT:    shl.b32 %r14, %r43, 3;
+; PTX71-NEXT:    mov.b32 %r44, 65535;
+; PTX71-NEXT:    shl.b32 %r45, %r44, %r14;
+; PTX71-NEXT:    not.b32 %r15, %r45;
+; PTX71-NEXT:    ld.shared.u32 %r54, [%rd3];
+; PTX71-NEXT:  $L__BB0_7: // %atomicrmw.start
+; PTX71-NEXT:    // =>This Inner Loop Header: Depth=1
+; PTX71-NEXT:    shr.u32 %r46, %r54, %r14;
+; PTX71-NEXT:    cvt.u16.u32 %rs11, %r46;
+; PTX71-NEXT:    mov.b16 %rs12, 0x3F80;
+; PTX71-NEXT:    fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
+; PTX71-NEXT:    cvt.u32.u16 %r47, %rs13;
+; PTX71-NEXT:    shl.b32 %r48, %r47, %r14;
+; PTX71-NEXT:    and.b32 %r49, %r54, %r15;
+; PTX71-NEXT:    or.b32 %r50, %r49, %r48;
+; PTX71-NEXT:    atom.shared.cas.b32 %r18, [%rd3], %r54, %r50;
+; PTX71-NEXT:    setp.ne.s32 %p4, %r18, %r54;
+; PTX71-NEXT:    mov.u32 %r54, %r18;
+; PTX71-NEXT:    @%p4 bra $L__BB0_7;
+; PTX71-NEXT:  // %bb.8: // %atomicrmw.end
+; PTX71-NEXT:    ret;
   %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
   %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
   %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic



More information about the llvm-commits mailing list