[llvm] b083340 - [llvm][NVPTX] Don't reorder MIs that construct a PTX function call (#116522)

Mon Nov 18 14:12:23 PST 2024

Author: Youngsuk Kim
Date: 2024-11-18T17:12:19-05:00
New Revision: b083340cb663b6bd785dbd5864e5afd950745e35

URL: https://github.com/llvm/llvm-project/commit/b083340cb663b6bd785dbd5864e5afd950745e35
DIFF: https://github.com/llvm/llvm-project/commit/b083340cb663b6bd785dbd5864e5afd950745e35.diff

LOG: [llvm][NVPTX] Don't reorder MIs that construct a PTX function call (#116522)

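With "-enable-misched", MachineScheduler can reorder MIs that must stay
together, in their original order, to generate legal PTX code for a
function call. This change overrides NVPTXInstrInfo::isSchedulingBoundary
so that those MIs are treated as scheduling boundaries.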

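Before this revision, generating PTX code for the attached test produced
the following invalid output, in which four unrelated arithmetic
instructions are scheduled into the middle of the call statement (between
`call.uni (retval0),` and `quux,`):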

```
  { // callseq 0, 0
  .param .b64 param0;
  st.param.f64  [param0], 0d0000000000000000;
  .param .b64 retval0;
  call.uni (retval0),
  mul.lo.s32  %r7, %r10, %r3;
  or.b32    %r8, %r4, %r7;
  mul.lo.s32  %r9, %r2, %r8;
  cvt.rn.f64.s32  %fd3, %r9;
  quux,
  (
  param0
  );
  ld.param.f64  %fd1, [retval0];
  } // callseq 0
```

Added: 
    llvm/test/CodeGen/NVPTX/misched_func_call.ll

Modified: 
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 4661c059d5f783..b4dbe6a0930caa 100644

--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -199,3 +199,23 @@ unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
   BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
   return 2;
 }
+
+bool NVPTXInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                          const MachineBasicBlock *MBB,
+                                          const MachineFunction &MF) const {
+  // The opcodes below each form part of a PTX function call sequence. Mark
+  // them as scheduling boundaries so the scheduler cannot reorder or split
+  // them up; every other opcode defers to the TargetInstrInfo default.
+  switch (MI.getOpcode()) {
+  case NVPTX::CallUniPrintCallRetInst1:
+  case NVPTX::CallArgBeginInst:
+  case NVPTX::CallArgI32imm:
+  case NVPTX::CallArgParam:
+  case NVPTX::LastCallArgI32imm:
+  case NVPTX::LastCallArgParam:
+  case NVPTX::CallArgEndInst1:
+    return true;
+  }
+
+  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}

diff  --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
index f674a00bc351bf..a1d9f017120188 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -67,6 +67,9 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo {
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
 };
 
 } // namespace llvm

diff  --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
new file mode 100644
index 00000000000000..e036753ce90306
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O3 -march=nvptx64 -enable-misched %s -o - | FileCheck %s
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: wombat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<11>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .f64 %fd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %bb
+; CHECK-NEXT:    ld.param.u32 %r4, [wombat_param_2];
+; CHECK-NEXT:    ld.param.u32 %r3, [wombat_param_1];
+; CHECK-NEXT:    ld.param.u32 %r2, [wombat_param_0];
+; CHECK-NEXT:    mov.b32 %r10, 0;
+; CHECK-NEXT:    mov.u64 %rd1, 0;
+; CHECK-NEXT:    mov.b32 %r6, 1;
+; CHECK-NEXT:  $L__BB0_1: // %bb3
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .b64 param0;
+; CHECK-NEXT:    st.param.f64 [param0], 0d0000000000000000;
+; CHECK-NEXT:    .param .b64 retval0;
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    quux,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    );
+; CHECK-NEXT:    mul.lo.s32 %r7, %r10, %r3;
+; CHECK-NEXT:    or.b32 %r8, %r4, %r7;
+; CHECK-NEXT:    mul.lo.s32 %r9, %r2, %r8;
+; CHECK-NEXT:    cvt.rn.f64.s32 %fd3, %r9;
+; CHECK-NEXT:    ld.param.f64 %fd1, [retval0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    cvt.rn.f64.u32 %fd4, %r10;
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, %fd3;
+; CHECK-NEXT:    st.global.f64 [%rd1], %fd5;
+; CHECK-NEXT:    mov.u32 %r10, %r6;
+; CHECK-NEXT:    bra.uni $L__BB0_1;
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb3, %bb
+  %phi = phi i32 [ 0, %bb ], [ 1, %bb3 ]
+  %call = tail call double @quux(double 0.000000e+00)
+  %mul = mul i32 %phi, %arg1
+  %or = or i32 %arg2, %mul
+  %mul4 = mul i32 %arg, %or
+  %sitofp = sitofp i32 %mul4 to double
+  %uitofp = uitofp i32 %phi to double
+  %fadd = fadd double %uitofp, %sitofp
+  store double %fadd, ptr addrspace(1) null, align 8
+  br label %bb3
+}
+
+declare double @quux(double)