[llvm] [llvm][NVPTX] Don't reorder MIs that construct a PTX function call (PR #116522)

Youngsuk Kim via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 18 12:23:04 PST 2024


https://github.com/JOE1994 updated https://github.com/llvm/llvm-project/pull/116522

>From 1d32026891d804a3deab80653a4d4e5bfea5ccdd Mon Sep 17 00:00:00 2001
From: Youngsuk Kim <youngsuk.kim at hpe.com>
Date: Sat, 16 Nov 2024 20:25:58 -0600
Subject: [PATCH 1/4] [llvm][NVPTX] Don't reorder MIs that construct a PTX
 function call

With "-enable-misched", MachineScheduler can reorder MIs that must stay together
(in their original order) to produce legal PTX code for a function call.

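Before this revision, generating PTX code for the attached test produces the
following invalid PTX code, in which unrelated instructions are scheduled into
the middle of the call sequence (between `call.uni (retval0),` and `_FOO,`):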

```
    { // callseq 0, 0
    .param .b64 param0;
    st.param.f64  [param0], %fd1;
    .param .b64 retval0;
    call.uni (retval0),
    cvt.u32.u64   %r20, %rd18;
    mad.lo.s32  %r21, %r7, %r20, 1;
    cvt.rn.f64.s32  %fd4, %r21;
    _FOO,
    (
    param0
    );
    ld.param.f64  %fd2, [retval0];
    add.s32   %r22, %r18, 1;
    cvt.rn.f64.s32  %fd5, %r22;
    } // callseq 0
```
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp     |  20 ++++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.h       |   3 +
 llvm/test/CodeGen/NVPTX/misched_func_call.ll | 108 +++++++++++++++++++
 3 files changed, 131 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/misched_func_call.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index bec40874c89488..9c1ed0d5f5abd9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -202,3 +202,23 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
   BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
   return 2;
 }
+
+bool NVPTXInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                          const MachineBasicBlock *MBB,
+                                          const MachineFunction &MF) const {
+  // Prevent the scheduler from reordering & splitting up MachineInstrs
+  // which must stick together (in initially set order) to
+  // comprise a valid PTX function call sequence.
+  switch (MI.getOpcode()) {
+    case NVPTX::CallUniPrintCallRetInst1:
+    case NVPTX::CallArgBeginInst:
+    case NVPTX::CallArgI32imm:
+    case NVPTX::CallArgParam:
+    case NVPTX::LastCallArgI32imm:
+    case NVPTX::LastCallArgParam:
+    case NVPTX::CallArgEndInst1:
+      return true;
+  }
+
+  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
index f674a00bc351bf..a1d9f017120188 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -67,6 +67,9 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo {
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
new file mode 100644
index 00000000000000..c54674a9c791f2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -0,0 +1,108 @@
+; RUN: llc -O3 -march=nvptx64 -enable-misched %s -o - | FileCheck %s
+
+; ModuleID = 'The Accel Module'
+source_filename = "The Accel Module"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Function Attrs: noinline
+define ptx_kernel void @"my_kernel"(i32 %"arg_0", i64 %"arg_1", i64 %"arg_2", i64 %"arg_3") {
+"Entry_BB":
+%r = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+%r6 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+%r7 = mul i32 %r, %r6
+%r9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+%r10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+%r11 = mul i32 %r, %r9
+%r12 = add i32 %r10, %r11
+%"arg_3.tr" = trunc i64 %"arg_3" to i32
+%r16 = shl i32 %"arg_3.tr", 1
+%r19.not = icmp slt i32 %r12, %r16
+br i1 %r19.not, label %"BB1490", label %"EXIT_BB"
+
+"BB1490": ; preds = %"Entry_BB"
+%r23 = sext i32 %"arg_0" to i64
+%r24 = shl nsw i64 %r23, 3
+br label %"BB1692"
+
+"BB1692": ; preds = %"BB18", %"BB1490"
+%"$$i_l40_0_t23.0" = phi i32 [ %r12, %"BB1490" ], [ %r80, %"BB18" ]
+%r28 = sext i32 %"$$i_l40_0_t23.0" to i64
+%0 = or i64 %r28, %"arg_3"
+%1 = and i64 %0, -4294967296
+%2 = icmp eq i64 %1, 0
+br i1 %2, label %3, label %8
+
+3:                                                ; preds = %"BB1692"
+%4 = trunc i64 %"arg_3" to i32
+%5 = trunc i64 %r28 to i32
+%6 = udiv i32 %5, %4
+%7 = zext i32 %6 to i64
+br label %"BB18"
+
+8:                                                ; preds = %"BB1692"
+%9 = sdiv i64 %r28, %"arg_3"
+br label %"BB18"
+
+"BB18": ; preds = %8, %3
+%10 = phi i64 [ %7, %3 ], [ %9, %8 ]
+%r31 = trunc i64 %10 to i32
+%.neg = mul i64 %10, -4294967296
+%r35 = ashr exact i64 %.neg, 32
+%r38 = mul i64 %"arg_3", %r35
+%r39 = add i64 %r28, %r38
+%r42 = mul i64 %r24, %r39
+%r44 = mul i32 %r31, 10
+%r47 = inttoptr i64 %"arg_1" to ptr addrspace(1)
+%gep2 = getelementptr i8, ptr addrspace(1) %r47, i64 %r42
+%11 = sext i32 %r44 to i64
+%r53 = getelementptr double, ptr addrspace(1) %gep2, i64 %11
+%r54 = load double, ptr addrspace(1) %r53, align 8
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: _FOO,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+%r55 = tail call double @_FOO(double %r54)
+%12 = trunc i64 %r39 to i32
+%r59 = mul i32 %"arg_0", %12
+%r60 = add i32 %r59, 1
+%r61 = sitofp i32 %r60 to double
+%r65 = add i32 %r31, 1
+%r66 = sitofp i32 %r65 to double
+%r67 = tail call double @llvm.fma.f64(double %r55, double %r55, double %r66)
+%r68 = fadd double %r67, %r61
+%r71 = inttoptr i64 %"arg_2" to ptr addrspace(1)
+%gep88 = getelementptr i8, ptr addrspace(1) %r71, i64 %r42
+%r77 = getelementptr double, ptr addrspace(1) %gep88, i64 %11
+store double %r68, ptr addrspace(1) %r77, align 8
+%r80 = add i32 %r7, %"$$i_l40_0_t23.0"
+%r85 = icmp slt i32 %r80, %r16
+br i1 %r85, label %"BB1692", label %"EXIT_BB"
+
+"EXIT_BB": ; preds = %"BB18", %"Entry_BB"
+ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+declare double @_FOO(double)
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare double @llvm.fma.f64(double, double, double) #1
+
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.module.flags = !{ !2}
+
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}

>From e074ec7a23671755ee1b16bcd49335dc08838be2 Mon Sep 17 00:00:00 2001
From: Youngsuk Kim <youngsuk.kim at hpe.com>
Date: Sat, 16 Nov 2024 21:00:02 -0600
Subject: [PATCH 2/4] Apply clang-format

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 9c1ed0d5f5abd9..d261f98b080105 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -210,14 +210,14 @@ bool NVPTXInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
   // which must stick together (in initially set order) to
   // comprise a valid PTX function call sequence.
   switch (MI.getOpcode()) {
-    case NVPTX::CallUniPrintCallRetInst1:
-    case NVPTX::CallArgBeginInst:
-    case NVPTX::CallArgI32imm:
-    case NVPTX::CallArgParam:
-    case NVPTX::LastCallArgI32imm:
-    case NVPTX::LastCallArgParam:
-    case NVPTX::CallArgEndInst1:
-      return true;
+  case NVPTX::CallUniPrintCallRetInst1:
+  case NVPTX::CallArgBeginInst:
+  case NVPTX::CallArgI32imm:
+  case NVPTX::CallArgParam:
+  case NVPTX::LastCallArgI32imm:
+  case NVPTX::LastCallArgParam:
+  case NVPTX::CallArgEndInst1:
+    return true;
   }
 
   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);

>From 557a6a57f5fbab131f1655950a9f842967bf34c8 Mon Sep 17 00:00:00 2001
From: Youngsuk Kim <youngsuk.kim at hpe.com>
Date: Mon, 18 Nov 2024 10:08:40 -0600
Subject: [PATCH 3/4] Use llvm-reduce to minimize test size

---
 llvm/test/CodeGen/NVPTX/misched_func_call.ll | 120 ++++---------------
 1 file changed, 20 insertions(+), 100 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index c54674a9c791f2..1ad00e7c37cd9d 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -1,108 +1,28 @@
 ; RUN: llc -O3 -march=nvptx64 -enable-misched %s -o - | FileCheck %s
 
-; ModuleID = 'The Accel Module'
-source_filename = "The Accel Module"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-; Function Attrs: noinline
-define ptx_kernel void @"my_kernel"(i32 %"arg_0", i64 %"arg_1", i64 %"arg_2", i64 %"arg_3") {
-"Entry_BB":
-%r = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-%r6 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
-%r7 = mul i32 %r, %r6
-%r9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-%r10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-%r11 = mul i32 %r, %r9
-%r12 = add i32 %r10, %r11
-%"arg_3.tr" = trunc i64 %"arg_3" to i32
-%r16 = shl i32 %"arg_3.tr", 1
-%r19.not = icmp slt i32 %r12, %r16
-br i1 %r19.not, label %"BB1490", label %"EXIT_BB"
-
-"BB1490": ; preds = %"Entry_BB"
-%r23 = sext i32 %"arg_0" to i64
-%r24 = shl nsw i64 %r23, 3
-br label %"BB1692"
-
-"BB1692": ; preds = %"BB18", %"BB1490"
-%"$$i_l40_0_t23.0" = phi i32 [ %r12, %"BB1490" ], [ %r80, %"BB18" ]
-%r28 = sext i32 %"$$i_l40_0_t23.0" to i64
-%0 = or i64 %r28, %"arg_3"
-%1 = and i64 %0, -4294967296
-%2 = icmp eq i64 %1, 0
-br i1 %2, label %3, label %8
-
-3:                                                ; preds = %"BB1692"
-%4 = trunc i64 %"arg_3" to i32
-%5 = trunc i64 %r28 to i32
-%6 = udiv i32 %5, %4
-%7 = zext i32 %6 to i64
-br label %"BB18"
-
-8:                                                ; preds = %"BB1692"
-%9 = sdiv i64 %r28, %"arg_3"
-br label %"BB18"
-
-"BB18": ; preds = %8, %3
-%10 = phi i64 [ %7, %3 ], [ %9, %8 ]
-%r31 = trunc i64 %10 to i32
-%.neg = mul i64 %10, -4294967296
-%r35 = ashr exact i64 %.neg, 32
-%r38 = mul i64 %"arg_3", %r35
-%r39 = add i64 %r28, %r38
-%r42 = mul i64 %r24, %r39
-%r44 = mul i32 %r31, 10
-%r47 = inttoptr i64 %"arg_1" to ptr addrspace(1)
-%gep2 = getelementptr i8, ptr addrspace(1) %r47, i64 %r42
-%11 = sext i32 %r44 to i64
-%r53 = getelementptr double, ptr addrspace(1) %gep2, i64 %11
-%r54 = load double, ptr addrspace(1) %r53, align 8
-; CHECK:      call.uni (retval0),
-; CHECK-NEXT: _FOO,
-; CHECK-NEXT: (
-; CHECK-NEXT: param0
-; CHECK-NEXT: );
-%r55 = tail call double @_FOO(double %r54)
-%12 = trunc i64 %r39 to i32
-%r59 = mul i32 %"arg_0", %12
-%r60 = add i32 %r59, 1
-%r61 = sitofp i32 %r60 to double
-%r65 = add i32 %r31, 1
-%r66 = sitofp i32 %r65 to double
-%r67 = tail call double @llvm.fma.f64(double %r55, double %r55, double %r66)
-%r68 = fadd double %r67, %r61
-%r71 = inttoptr i64 %"arg_2" to ptr addrspace(1)
-%gep88 = getelementptr i8, ptr addrspace(1) %r71, i64 %r42
-%r77 = getelementptr double, ptr addrspace(1) %gep88, i64 %11
-store double %r68, ptr addrspace(1) %r77, align 8
-%r80 = add i32 %r7, %"$$i_l40_0_t23.0"
-%r85 = icmp slt i32 %r80, %r16
-br i1 %r85, label %"BB1692", label %"EXIT_BB"
-
-"EXIT_BB": ; preds = %"BB18", %"Entry_BB"
-ret void
+define ptx_kernel void @my_kernel(i32 %arg_0, i32 %arg_3.tr, i32 %"$$i_l40_0_t23.0") {
+Entry_BB:
+  br label %BB1692
+
+BB1692:                                           ; preds = %BB1692, %Entry_BB
+  %"$$i_l40_0_t23.02" = phi i32 [ 0, %Entry_BB ], [ 1, %BB1692 ]
+  ; CHECK:      call.uni (retval0),
+  ; CHECK-NEXT: _FOO,
+  ; CHECK-NEXT: (
+  ; CHECK-NEXT: param0
+  ; CHECK-NEXT: );
+  %r55 = tail call double @_FOO(double 0.000000e+00)
+  %0 = mul i32 %"$$i_l40_0_t23.02", %arg_3.tr
+  %1 = or i32 %"$$i_l40_0_t23.0", %0
+  %r59 = mul i32 %arg_0, %1
+  %r61 = sitofp i32 %r59 to double
+  %r66 = uitofp i32 %"$$i_l40_0_t23.02" to double
+  %r68 = fadd double %r66, %r61
+  store double %r68, ptr addrspace(1) null, align 8
+  br label %BB1692
 }
 
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
-
 declare double @_FOO(double)
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare double @llvm.fma.f64(double, double, double) #1
-
-attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-
-!llvm.module.flags = !{ !2}
-
-!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}

>From 91b256e1a3c638d10cdbb8c019db0acbd33e79a6 Mon Sep 17 00:00:00 2001
From: Youngsuk Kim <youngsuk.kim at hpe.com>
Date: Mon, 18 Nov 2024 14:18:58 -0600
Subject: [PATCH 4/4] apply instnamer pass

---
 llvm/test/CodeGen/NVPTX/misched_func_call.ll | 32 ++++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index 1ad00e7c37cd9d..d5e370990062d9 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -3,26 +3,26 @@
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-define ptx_kernel void @my_kernel(i32 %arg_0, i32 %arg_3.tr, i32 %"$$i_l40_0_t23.0") {
-Entry_BB:
-  br label %BB1692
+define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
+bb:
+  br label %bb3
 
-BB1692:                                           ; preds = %BB1692, %Entry_BB
-  %"$$i_l40_0_t23.02" = phi i32 [ 0, %Entry_BB ], [ 1, %BB1692 ]
+bb3:                                              ; preds = %bb3, %bb
+  %phi = phi i32 [ 0, %bb ], [ 1, %bb3 ]
   ; CHECK:      call.uni (retval0),
-  ; CHECK-NEXT: _FOO,
+  ; CHECK-NEXT: quux,
   ; CHECK-NEXT: (
   ; CHECK-NEXT: param0
   ; CHECK-NEXT: );
-  %r55 = tail call double @_FOO(double 0.000000e+00)
-  %0 = mul i32 %"$$i_l40_0_t23.02", %arg_3.tr
-  %1 = or i32 %"$$i_l40_0_t23.0", %0
-  %r59 = mul i32 %arg_0, %1
-  %r61 = sitofp i32 %r59 to double
-  %r66 = uitofp i32 %"$$i_l40_0_t23.02" to double
-  %r68 = fadd double %r66, %r61
-  store double %r68, ptr addrspace(1) null, align 8
-  br label %BB1692
+  %call = tail call double @quux(double 0.000000e+00)
+  %mul = mul i32 %phi, %arg1
+  %or = or i32 %arg2, %mul
+  %mul4 = mul i32 %arg, %or
+  %sitofp = sitofp i32 %mul4 to double
+  %uitofp = uitofp i32 %phi to double
+  %fadd = fadd double %uitofp, %sitofp
+  store double %fadd, ptr addrspace(1) null, align 8
+  br label %bb3
 }
 
-declare double @_FOO(double)
+declare double @quux(double)



More information about the llvm-commits mailing list