[llvm] [NVPTX] Skip numbering unreferenced virtual registers (readability) (PR #154391)

via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 19 10:15:38 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

<details>
<summary>Changes</summary>

When assigning numbers to registers, skip any with neither uses nor defs. This will not have any impact at all on the final SASS, but it makes the PTX slightly more readable and more consistent across minor changes.

---

Patch is 488.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154391.diff


45 Files Affected:

- (modified) llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp (+7-8) 
- (modified) llvm/test/CodeGen/NVPTX/aggregate-return.ll (+10-10) 
- (modified) llvm/test/CodeGen/NVPTX/atomics-sm70.ll (+62-62) 
- (modified) llvm/test/CodeGen/NVPTX/atomics-sm90.ll (+62-62) 
- (modified) llvm/test/CodeGen/NVPTX/atomics.ll (+23-23) 
- (modified) llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll (+1-1) 
- (modified) llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll (+1-1) 
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll (+711-711) 
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll (+711-711) 
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll (+711-711) 
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg.ll (+430-430) 
- (modified) llvm/test/CodeGen/NVPTX/combine-mad.ll (+1-1) 
- (modified) llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll (+62-62) 
- (modified) llvm/test/CodeGen/NVPTX/cse-mov-sym.ll (+12-12) 
- (modified) llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll (+58-58) 
- (modified) llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll (+2-2) 
- (modified) llvm/test/CodeGen/NVPTX/extractelement.ll (+4-4) 
- (modified) llvm/test/CodeGen/NVPTX/f16x2-instructions.ll (+3-3) 
- (modified) llvm/test/CodeGen/NVPTX/f32x2-instructions.ll (+3-3) 
- (modified) llvm/test/CodeGen/NVPTX/fma.ll (+2-2) 
- (modified) llvm/test/CodeGen/NVPTX/forward-ld-param.ll (+4-7) 
- (modified) llvm/test/CodeGen/NVPTX/i1-select.ll (+19-19) 
- (modified) llvm/test/CodeGen/NVPTX/i128-array.ll (+5-5) 
- (modified) llvm/test/CodeGen/NVPTX/i128.ll (+381-381) 
- (modified) llvm/test/CodeGen/NVPTX/i16x2-instructions.ll (+3-3) 
- (modified) llvm/test/CodeGen/NVPTX/i8x2-instructions.ll (+2-2) 
- (modified) llvm/test/CodeGen/NVPTX/i8x4-instructions.ll (+6-6) 
- (modified) llvm/test/CodeGen/NVPTX/indirect_byval.ll (+9-9) 
- (modified) llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll (+10-10) 
- (modified) llvm/test/CodeGen/NVPTX/jump-table.ll (+4-4) 
- (modified) llvm/test/CodeGen/NVPTX/ld-param-sink.ll (+3-3) 
- (modified) llvm/test/CodeGen/NVPTX/ldparam-v4.ll (+1-1) 
- (modified) llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll (+28-28) 
- (modified) llvm/test/CodeGen/NVPTX/local-stack-frame.ll (+15-15) 
- (modified) llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll (+12-12) 
- (modified) llvm/test/CodeGen/NVPTX/lower-byval-args.ll (+72-73) 
- (modified) llvm/test/CodeGen/NVPTX/misched_func_call.ll (+11-11) 
- (modified) llvm/test/CodeGen/NVPTX/param-add.ll (+1-1) 
- (modified) llvm/test/CodeGen/NVPTX/param-overalign.ll (+2-2) 
- (modified) llvm/test/CodeGen/NVPTX/surf-read-cuda.ll (+1-1) 
- (modified) llvm/test/CodeGen/NVPTX/surf-write-cuda.ll (-1) 
- (modified) llvm/test/CodeGen/NVPTX/tex-read-cuda.ll (+4-4) 
- (modified) llvm/test/CodeGen/NVPTX/texsurf-queries.ll (-4) 
- (modified) llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll (+117-117) 
- (modified) llvm/test/CodeGen/NVPTX/variadics-backend.ll (+48-48) 


``````````diff
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 38912a7f09e30..077fb56910c07 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1458,7 +1458,6 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
   // Map the global virtual register number to a register class specific
   // virtual register number starting from 1 with that class.
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  //unsigned numRegClasses = TRI->getNumRegClasses();
 
   // Emit the Fake Stack Object
   const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1479,13 +1478,13 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
   // global virtual
   // register number and the per class virtual register number.
   // We use the per class virtual register number in the ptx output.
-  unsigned int numVRs = MRI->getNumVirtRegs();
-  for (unsigned i = 0; i < numVRs; i++) {
-    Register vr = Register::index2VirtReg(i);
-    const TargetRegisterClass *RC = MRI->getRegClass(vr);
-    DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
-    int n = regmap.size();
-    regmap.insert(std::make_pair(vr, n + 1));
+  for (unsigned I : llvm::seq(MRI->getNumVirtRegs())) {
+    Register VR = Register::index2VirtReg(I);
+    if (MRI->use_empty(VR) && MRI->def_empty(VR))
+      continue;
+    const TargetRegisterClass *RC = MRI->getRegClass(VR);
+    DenseMap<unsigned, unsigned> &RCRegMap = VRegMapping[RC];
+    RCRegMap.insert(std::make_pair(VR, RCRegMap.size() + 1));
   }
 
   // Emit declaration of the virtual registers or 'physical' registers for
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index abc873e2aa706..bf51973e88357 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -10,7 +10,7 @@ declare {float, float} @bars({float, float} %input)
 define void @test_v2f32(<2 x float> %input, ptr %output) {
 ; CHECK-LABEL: test_v2f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_v2f32_param_0];
@@ -21,8 +21,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
 ; CHECK-NEXT:    call.uni (retval0), barv, (param0);
 ; CHECK-NEXT:    ld.param.b64 %rd2, [retval0];
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    ld.param.b64 %rd4, [test_v2f32_param_1];
-; CHECK-NEXT:    st.b64 [%rd4], %rd2;
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_v2f32_param_1];
+; CHECK-NEXT:    st.b64 [%rd3], %rd2;
 ; CHECK-NEXT:    ret;
   %call = tail call <2 x float> @barv(<2 x float> %input)
   store <2 x float> %call, ptr %output, align 8
@@ -32,8 +32,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
 define void @test_v3f32(<3 x float> %input, ptr %output) {
 ; CHECK-LABEL: test_v3f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_v3f32_param_0];
@@ -47,9 +47,9 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
 ; CHECK-NEXT:    ld.param.b32 %r2, [retval0+8];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [retval0];
 ; CHECK-NEXT:    } // callseq 1
-; CHECK-NEXT:    ld.param.b64 %rd4, [test_v3f32_param_1];
-; CHECK-NEXT:    st.b32 [%rd4+8], %r2;
-; CHECK-NEXT:    st.b64 [%rd4], %rd2;
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_v3f32_param_1];
+; CHECK-NEXT:    st.b32 [%rd3+8], %r2;
+; CHECK-NEXT:    st.b64 [%rd3], %rd2;
 ; CHECK-NEXT:    ret;
   %call = tail call <3 x float> @barv3(<3 x float> %input)
 ; Make sure we don't load more values than than we need to.
@@ -60,7 +60,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
 define void @test_a2f32([2 x float] %input, ptr %output) {
 ; CHECK-LABEL: test_a2f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -87,7 +87,7 @@ define void @test_a2f32([2 x float] %input, ptr %output) {
 define void @test_s2f32({float, float} %input, ptr %output) {
 ; CHECK-LABEL: test_s2f32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
index f710d7f883a1b..5f4856acb317c 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -47,90 +47,90 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62:       {
 ; CHECKPTX62-NEXT:    .reg .pred %p<5>;
 ; CHECKPTX62-NEXT:    .reg .b16 %rs<11>;
-; CHECKPTX62-NEXT:    .reg .b32 %r<58>;
+; CHECKPTX62-NEXT:    .reg .b32 %r<50>;
 ; CHECKPTX62-EMPTY:
 ; CHECKPTX62-NEXT:  // %bb.0:
 ; CHECKPTX62-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX62-NEXT:    ld.param.b32 %r23, [test_param_2];
-; CHECKPTX62-NEXT:    ld.param.b32 %r22, [test_param_1];
-; CHECKPTX62-NEXT:    ld.param.b32 %r24, [test_param_0];
-; CHECKPTX62-NEXT:    and.b32 %r1, %r24, -4;
-; CHECKPTX62-NEXT:    and.b32 %r25, %r24, 3;
-; CHECKPTX62-NEXT:    shl.b32 %r2, %r25, 3;
-; CHECKPTX62-NEXT:    mov.b32 %r26, 65535;
-; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
-; CHECKPTX62-NEXT:    not.b32 %r3, %r27;
-; CHECKPTX62-NEXT:    ld.b32 %r54, [%r1];
+; CHECKPTX62-NEXT:    ld.param.b32 %r15, [test_param_2];
+; CHECKPTX62-NEXT:    ld.param.b32 %r14, [test_param_1];
+; CHECKPTX62-NEXT:    ld.param.b32 %r16, [test_param_0];
+; CHECKPTX62-NEXT:    and.b32 %r1, %r16, -4;
+; CHECKPTX62-NEXT:    and.b32 %r17, %r16, 3;
+; CHECKPTX62-NEXT:    shl.b32 %r2, %r17, 3;
+; CHECKPTX62-NEXT:    mov.b32 %r18, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r19, %r18, %r2;
+; CHECKPTX62-NEXT:    not.b32 %r3, %r19;
+; CHECKPTX62-NEXT:    ld.b32 %r46, [%r1];
 ; CHECKPTX62-NEXT:  $L__BB0_1: // %atomicrmw.start45
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT:    shr.u32 %r28, %r54, %r2;
-; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r28;
+; CHECKPTX62-NEXT:    shr.u32 %r20, %r46, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r20;
 ; CHECKPTX62-NEXT:    add.rn.f16 %rs3, %rs2, %rs1;
-; CHECKPTX62-NEXT:    cvt.u32.u16 %r29, %rs3;
-; CHECKPTX62-NEXT:    shl.b32 %r30, %r29, %r2;
-; CHECKPTX62-NEXT:    and.b32 %r31, %r54, %r3;
-; CHECKPTX62-NEXT:    or.b32 %r32, %r31, %r30;
-; CHECKPTX62-NEXT:    atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32;
-; CHECKPTX62-NEXT:    setp.ne.b32 %p1, %r6, %r54;
-; CHECKPTX62-NEXT:    mov.b32 %r54, %r6;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r21, %rs3;
+; CHECKPTX62-NEXT:    shl.b32 %r22, %r21, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r23, %r46, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r24, %r23, %r22;
+; CHECKPTX62-NEXT:    atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
+; CHECKPTX62-NEXT:    setp.ne.b32 %p1, %r4, %r46;
+; CHECKPTX62-NEXT:    mov.b32 %r46, %r4;
 ; CHECKPTX62-NEXT:    @%p1 bra $L__BB0_1;
 ; CHECKPTX62-NEXT:  // %bb.2: // %atomicrmw.end44
-; CHECKPTX62-NEXT:    ld.b32 %r55, [%r1];
+; CHECKPTX62-NEXT:    ld.b32 %r47, [%r1];
 ; CHECKPTX62-NEXT:  $L__BB0_3: // %atomicrmw.start27
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT:    shr.u32 %r33, %r55, %r2;
-; CHECKPTX62-NEXT:    cvt.u16.u32 %rs4, %r33;
+; CHECKPTX62-NEXT:    shr.u32 %r25, %r47, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs4, %r25;
 ; CHECKPTX62-NEXT:    mov.b16 %rs5, 0x3C00;
 ; CHECKPTX62-NEXT:    add.rn.f16 %rs6, %rs4, %rs5;
-; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs6;
-; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r2;
-; CHECKPTX62-NEXT:    and.b32 %r36, %r55, %r3;
-; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
-; CHECKPTX62-NEXT:    atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37;
-; CHECKPTX62-NEXT:    setp.ne.b32 %p2, %r9, %r55;
-; CHECKPTX62-NEXT:    mov.b32 %r55, %r9;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r26, %rs6;
+; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r28, %r47, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r29, %r28, %r27;
+; CHECKPTX62-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
+; CHECKPTX62-NEXT:    setp.ne.b32 %p2, %r5, %r47;
+; CHECKPTX62-NEXT:    mov.b32 %r47, %r5;
 ; CHECKPTX62-NEXT:    @%p2 bra $L__BB0_3;
 ; CHECKPTX62-NEXT:  // %bb.4: // %atomicrmw.end26
-; CHECKPTX62-NEXT:    and.b32 %r10, %r22, -4;
-; CHECKPTX62-NEXT:    shl.b32 %r38, %r22, 3;
-; CHECKPTX62-NEXT:    and.b32 %r11, %r38, 24;
-; CHECKPTX62-NEXT:    mov.b32 %r39, 65535;
-; CHECKPTX62-NEXT:    shl.b32 %r40, %r39, %r11;
-; CHECKPTX62-NEXT:    not.b32 %r12, %r40;
-; CHECKPTX62-NEXT:    ld.global.b32 %r56, [%r10];
+; CHECKPTX62-NEXT:    and.b32 %r6, %r14, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r30, %r14, 3;
+; CHECKPTX62-NEXT:    and.b32 %r7, %r30, 24;
+; CHECKPTX62-NEXT:    mov.b32 %r31, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r32, %r31, %r7;
+; CHECKPTX62-NEXT:    not.b32 %r8, %r32;
+; CHECKPTX62-NEXT:    ld.global.b32 %r48, [%r6];
 ; CHECKPTX62-NEXT:  $L__BB0_5: // %atomicrmw.start9
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT:    shr.u32 %r41, %r56, %r11;
-; CHECKPTX62-NEXT:    cvt.u16.u32 %rs7, %r41;
+; CHECKPTX62-NEXT:    shr.u32 %r33, %r48, %r7;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs7, %r33;
 ; CHECKPTX62-NEXT:    add.rn.f16 %rs8, %rs7, %rs1;
-; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs8;
-; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r11;
-; CHECKPTX62-NEXT:    and.b32 %r44, %r56, %r12;
-; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
-; CHECKPTX62-NEXT:    atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45;
-; CHECKPTX62-NEXT:    setp.ne.b32 %p3, %r15, %r56;
-; CHECKPTX62-NEXT:    mov.b32 %r56, %r15;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs8;
+; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r7;
+; CHECKPTX62-NEXT:    and.b32 %r36, %r48, %r8;
+; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX62-NEXT:    atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
+; CHECKPTX62-NEXT:    setp.ne.b32 %p3, %r9, %r48;
+; CHECKPTX62-NEXT:    mov.b32 %r48, %r9;
 ; CHECKPTX62-NEXT:    @%p3 bra $L__BB0_5;
 ; CHECKPTX62-NEXT:  // %bb.6: // %atomicrmw.end8
-; CHECKPTX62-NEXT:    and.b32 %r16, %r23, -4;
-; CHECKPTX62-NEXT:    shl.b32 %r46, %r23, 3;
-; CHECKPTX62-NEXT:    and.b32 %r17, %r46, 24;
-; CHECKPTX62-NEXT:    mov.b32 %r47, 65535;
-; CHECKPTX62-NEXT:    shl.b32 %r48, %r47, %r17;
-; CHECKPTX62-NEXT:    not.b32 %r18, %r48;
-; CHECKPTX62-NEXT:    ld.shared.b32 %r57, [%r16];
+; CHECKPTX62-NEXT:    and.b32 %r10, %r15, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r38, %r15, 3;
+; CHECKPTX62-NEXT:    and.b32 %r11, %r38, 24;
+; CHECKPTX62-NEXT:    mov.b32 %r39, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r40, %r39, %r11;
+; CHECKPTX62-NEXT:    not.b32 %r12, %r40;
+; CHECKPTX62-NEXT:    ld.shared.b32 %r49, [%r10];
 ; CHECKPTX62-NEXT:  $L__BB0_7: // %atomicrmw.start
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT:    shr.u32 %r49, %r57, %r17;
-; CHECKPTX62-NEXT:    cvt.u16.u32 %rs9, %r49;
+; CHECKPTX62-NEXT:    shr.u32 %r41, %r49, %r11;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs9, %r41;
 ; CHECKPTX62-NEXT:    add.rn.f16 %rs10, %rs9, %rs1;
-; CHECKPTX62-NEXT:    cvt.u32.u16 %r50, %rs10;
-; CHECKPTX62-NEXT:    shl.b32 %r51, %r50, %r17;
-; CHECKPTX62-NEXT:    and.b32 %r52, %r57, %r18;
-; CHECKPTX62-NEXT:    or.b32 %r53, %r52, %r51;
-; CHECKPTX62-NEXT:    atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53;
-; CHECKPTX62-NEXT:    setp.ne.b32 %p4, %r21, %r57;
-; CHECKPTX62-NEXT:    mov.b32 %r57, %r21;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs10;
+; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r11;
+; CHECKPTX62-NEXT:    and.b32 %r44, %r49, %r12;
+; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX62-NEXT:    atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
+; CHECKPTX62-NEXT:    setp.ne.b32 %p4, %r13, %r49;
+; CHECKPTX62-NEXT:    mov.b32 %r49, %r13;
 ; CHECKPTX62-NEXT:    @%p4 bra $L__BB0_7;
 ; CHECKPTX62-NEXT:  // %bb.8: // %atomicrmw.end
 ; CHECKPTX62-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index f96fd30019025..e560d4386c20d 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -47,93 +47,93 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71:       {
 ; CHECKPTX71-NEXT:    .reg .pred %p<5>;
 ; CHECKPTX71-NEXT:    .reg .b16 %rs<14>;
-; CHECKPTX71-NEXT:    .reg .b32 %r<58>;
+; CHECKPTX71-NEXT:    .reg .b32 %r<50>;
 ; CHECKPTX71-EMPTY:
 ; CHECKPTX71-NEXT:  // %bb.0:
 ; CHECKPTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX71-NEXT:    ld.param.b32 %r23, [test_param_2];
-; CHECKPTX71-NEXT:    ld.param.b32 %r22, [test_param_1];
-; CHECKPTX71-NEXT:    ld.param.b32 %r24, [test_param_0];
-; CHECKPTX71-NEXT:    and.b32 %r1, %r24, -4;
-; CHECKPTX71-NEXT:    and.b32 %r25, %r24, 3;
-; CHECKPTX71-NEXT:    shl.b32 %r2, %r25, 3;
-; CHECKPTX71-NEXT:    mov.b32 %r26, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
-; CHECKPTX71-NEXT:    not.b32 %r3, %r27;
-; CHECKPTX71-NEXT:    ld.b32 %r54, [%r1];
+; CHECKPTX71-NEXT:    ld.param.b32 %r15, [test_param_2];
+; CHECKPTX71-NEXT:    ld.param.b32 %r14, [test_param_1];
+; CHECKPTX71-NEXT:    ld.param.b32 %r16, [test_param_0];
+; CHECKPTX71-NEXT:    and.b32 %r1, %r16, -4;
+; CHECKPTX71-NEXT:    and.b32 %r17, %r16, 3;
+; CHECKPTX71-NEXT:    shl.b32 %r2, %r17, 3;
+; CHECKPTX71-NEXT:    mov.b32 %r18, 65535;
+; CHECKPTX71-NEXT:    shl.b32 %r19, %r18, %r2;
+; CHECKPTX71-NEXT:    not.b32 %r3, %r19;
+; CHECKPTX71-NEXT:    ld.b32 %r46, [%r1];
 ; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start45
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r28, %r54, %r2;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r28;
+; CHECKPTX71-NEXT:    shr.u32 %r20, %r46, %r2;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r20;
 ; CHECKPTX71-NEXT:    mov.b16 %rs3, 0x3F80;
 ; CHECKPTX71-NEXT:    fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r29, %rs4;
-; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
-; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
-; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32;
-; CHECKPTX71-NEXT:    setp.ne.b32 %p1, %r6, %r54;
-; CHECKPTX71-NEXT:    mov.b32 %r54, %r6;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r21, %rs4;
+; CHECKPTX71-NEXT:    shl.b32 %r22, %r21, %r2;
+; CHECKPTX71-NEXT:    and.b32 %r23, %r46, %r3;
+; CHECKPTX71-NEXT:    or.b32 %r24, %r23, %r22;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
+; CHECKPTX71-NEXT:    setp.ne.b32 %p1, %r4, %r46;
+; CHECKPTX71-NEXT:    mov.b32 %r46, %r4;
 ; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
 ; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end44
-; CHECKPTX71-NEXT:    ld.b32 %r55, [%r1];
+; CHECKPTX71-NEXT:    ld.b32 %r47, [%r1];
 ; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start27
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r33, %r55, %r2;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs5, %r33;
+; CHECKPTX71-NEXT:    shr.u32 %r25, %r47, %r2;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs5, %r25;
 ; CHECKPTX71-NEXT:    mov.b16 %rs6, 0x3F80;
 ; CHECKPTX71-NEXT:    fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs7;
-; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
-; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
-; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37;
-; CHECKPTX71-NEXT:    setp.ne.b32 %p2, %r9, %r55;
-; CHECKPTX71-NEXT:    mov.b32 %r55, %r9;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r26, %rs7;
+; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX71-NEXT:    and.b32 %r28, %r47, %r3;
+; CHECKPTX71-NEXT:    or.b32 %r29, %r28, %r27;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
+; CHECKPTX71-NEXT:    setp.ne.b32 %p2, %r5, %r47;
+; CHECKPTX71-NEXT:    mov.b32 %r47, %r5;
 ; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
 ; CHECKPTX71-NEXT:  // %bb.4: // %atomicrmw.end26
-; CHECKPTX71-NEXT:    and.b32 %r10, %r22, -4;
-; CHECKPTX71-NEXT:    shl.b32 %r38, %r22, 3;
-; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
-; CHECKPTX71-NEXT:    mov.b32 %r39, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r40, %r39, %r11;
-; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
-; CHECKPTX71-NEXT:    ld.global.b32 %r56, [%r10];
+; CHECKPTX71-NEXT:    and.b32 %r6, %r14, -4;
+; CHECKPTX71-NEXT:    shl.b32 %r30, %r14, 3;
+; CHECKPTX71-NEXT:    and.b32 %r7, %r30, 24;
+; CHECKPTX71-NEXT:    mov.b32 %r31, 65535;
+; CHECKPTX71-NEXT:    shl.b32 %r32, %r31, %r7;
+; CHECKPTX71-NEXT:    not.b32 %r8, %r32;
+; CHECKPTX71-NEXT:    ld.global.b32 %r48, [%r6];
 ; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start9
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r41, %r56, %r11;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs8, %r41;
+; CHECKPTX71-NEXT:    shr.u32 %r33, %r48, %r7;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs8, %r33;
 ; CHECKPTX71-NEXT:    mov.b16 %rs9, 0x3F80;
 ; CHECKPTX71-NEXT:    fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs10;
-; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
-; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
-; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT:    atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45;
-; CHECKPTX71-NEXT:    setp.ne.b32 %p3, %r15, %r56;
-; CHECKPTX71-NEXT:    mov.b32 %r56, %r15;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs10;
+; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r7;
+; CHECKPTX71-NEXT:    and.b32 %r36, %r48, %r8;
+; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
+; CHECKPTX71-NEXT:    setp.ne.b32 %p3, %r9, %r48;
+; CHECKPTX71-NEXT:    mov.b32 %r48, %r9;
 ; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
 ; CHECKPTX71-NEXT:  // %bb.6: // %atomicrmw.end8
-; CHECKPTX71-NEXT:    and.b32 %r16, %r23, -4;
-; CHECKPTX71-NEXT:    shl.b32 %r46, %r23, 3;
-; CHECKPTX71-NEXT:    and.b32 %r17, %r46, 24;
-; CHECKPTX71-NEXT:    mov.b32 %r47, 65535;
-; CHECKPTX71-NEXT:    shl.b32 %r48, %r47, %r17;
-; CHECKPTX71-NEXT:    not.b32 %r18, %r48;
-; CHECKPTX71-NEXT:    ld.shared.b32 %r57, [%r16];
+; CHECKPTX71-NEXT:    and.b32 %r10, %r15, -4;
+; CHECKPTX71-NEXT:    shl.b32 %r38, %r15, 3;
+; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
+; CHECKPTX71-NEXT:    mov.b32 %r39, 65535;
+; CHECKPTX71-NEXT:    shl.b32 %r40, %r39, %r11;
+; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
+; CHECKPTX71-NEXT:    ld.shared.b32 %r49, [%r10];
 ; CHECKPTX71-NEXT:  $L__BB0_7: // %atomicrmw.start
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    shr.u32 %r49, %r57, %r17;
-; CHECKPTX71-NEXT:    cvt.u16.u32 %rs11, %r49;
+; CHECKPTX71-NEXT:    shr.u32 %r41, %r49, %r11;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs11, %r41;
 ; CHECKPTX71-NEXT:    mov.b16 %rs12, 0x3F80;
 ; CHECKPTX71-NEXT:    fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
-; CHECKPTX71-NEXT:    cvt.u32.u16 %r50, %rs13;
-; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
-; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
-; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT:    atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53;
-; CHECKPTX71-NEXT:    setp.ne.b32 %p4, %r21, %r57;
-; CHECKPTX71-NEXT:    mov.b32 %r57, %r21;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs13;
+; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
+; CHECKPTX71-NEXT:    and.b32 %r44, %r49, %r12;
+; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX71-NEXT:    atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
+; CHECKPTX71-NEXT:    setp.ne.b32 %p4, %r13, %r49;
+; CHECKPTX71-NEXT:    mov.b32 %r49, %r13;
 ; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
 ; CHECKPTX71-NEXT:  // %bb.8: // %atomicrmw.end
 ; CHECKPTX71-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index 04a58cf22cfc5..6ea02f35e9626 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -425,40 +425,40 @@ define half @atomicrmw_add_f...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/154391


More information about the llvm-commits mailing list