[llvm] [NVPTX] Convert calls to indirect when call signature mismatches function signature (PR #107644)

Kevin McAfee via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 11 11:58:15 PDT 2024


https://github.com/kalxr updated https://github.com/llvm/llvm-project/pull/107644

>From c2392b92b601f668558bd07eea153adce574d3c3 Mon Sep 17 00:00:00 2001
From: Kevin McAfee <kmcafee at nvidia.com>
Date: Fri, 30 Aug 2024 11:55:14 -0700
Subject: [PATCH 1/3] [NVPTX] Convert calls to indirect when call signature
 mismatches function signature

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   | 47 +++++++++-
 llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll |  4 +-
 .../CodeGen/NVPTX/convert-call-to-indirect.ll | 89 +++++++++++++++++++
 3 files changed, 137 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5c5766a8b23455..2be9b76ba9fc96 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1657,6 +1657,33 @@ LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
   return RetVal;
 }
 
+static bool shouldConvertToIndirectCall(bool IsVarArg, unsigned ParamCount,
+                                        NVPTXTargetLowering::ArgListTy &Args,
+                                        const CallBase *CB,
+                                        GlobalAddressSDNode *Func) {
+  if (!Func)
+    return false;
+  auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal());
+  if (!CalleeFunc)
+    return false;
+
+  auto ActualReturnType = CalleeFunc->getReturnType();
+  if (CB->getType() != ActualReturnType)
+    return true;
+
+  if (IsVarArg)
+    return false;
+
+  auto ActualNumParams = CalleeFunc->getFunctionType()->getNumParams();
+  if (ParamCount != ActualNumParams)
+    return true;
+  for (const Argument &I : CalleeFunc->args())
+    if (I.getType() != Args[I.getArgNo()].Ty)
+      return true;
+
+  return false;
+}
+
 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
 
@@ -1971,10 +1998,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                     VADeclareParam->getVTList(), DeclareParamOps);
   }
 
+  // If the param count, type of any param, or return type of the callsite
+  // mismatches with that of the function signature, convert the callsite to an
+  // indirect call.
+  bool ConvertToIndirectCall =
+      shouldConvertToIndirectCall(CLI.IsVarArg, ParamCount, Args, CB, Func);
+
   // Both indirect calls and libcalls have nullptr Func. In order to distinguish
   // between them we must rely on the call site value which is valid for
   // indirect calls but is always null for libcalls.
-  bool isIndirectCall = !Func && CB;
+  bool isIndirectCall = (!Func && CB) || ConvertToIndirectCall;
 
   if (isa<ExternalSymbolSDNode>(Callee)) {
     Function* CalleeFunc = nullptr;
@@ -2026,6 +2059,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
   InGlue = Chain.getValue(1);
 
+  if (ConvertToIndirectCall) {
+    // Copy the function ptr to a ptx register and use the register to call the
+    // function.
+    EVT DestVT = Callee.getValueType();
+    MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned DestReg =
+        RegInfo.createVirtualRegister(TLI.getRegClassFor(DestVT.getSimpleVT()));
+    auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee);
+    Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT);
+  }
+
   // Ops to print out the function name
   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue CallVoidOps[] = { Chain, Callee, InGlue };
diff --git a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
index c5f7bd1bd1ba20..bd723a296e620f 100644
--- a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
@@ -17,8 +17,8 @@ target triple = "nvptx64-nvidia-cuda"
 ; CHECK: st.param.b16   [param2+0], %rs1;
 ; CHECK: st.param.b16   [param2+2], %rs2;
 ; CHECK: .param .align 2 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: _Z20__spirv_GroupCMulKHRjjN5__spv12complex_halfE,
+; CHECK-NEXT: prototype_0 : .callprototype (.param .align 2 .b8 _[4]) _ (.param .b32 _, .param .b32 _, .param .align 2 .b8 _[4]);
+; CHECK-NEXT: call (retval0),
 define weak_odr void @foo() {
 entry:
   %call.i.i.i = tail call %"class.complex" @_Z20__spirv_GroupCMulKHRjjN5__spv12complex_halfE(i32 0, i32 0, ptr byval(%"class.complex") null)
diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
new file mode 100644
index 00000000000000..2602c3b0d041b5
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 | %ptxas-verify %}
+
+%struct.64 = type <{ i64 }>
+declare i64 @callee(ptr %p);
+declare i64 @callee_variadic(ptr %p, ...);
+
+define %struct.64 @test_return_type_mismatch(ptr %p) {
+; CHECK-LABEL: test_return_type_mismatch(
+; CHECK:         .param .align 1 .b8 retval0[8];
+; CHECK-NEXT:    prototype_0 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_0;
+  %ret = call %struct.64 @callee(ptr %p)
+  ret %struct.64 %ret
+}
+
+define i64 @test_param_type_mismatch(ptr %p) {
+; CHECK-LABEL: test_param_type_mismatch(
+; CHECK:         .param .b64 retval0;
+; CHECK-NEXT:    prototype_1 : .callprototype (.param .b64 _) _ (.param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_1;
+  %ret = call i64 @callee(i64 7)
+  ret i64 %ret
+}
+
+define i64 @test_param_count_mismatch(ptr %p) {
+; CHECK-LABEL: test_param_count_mismatch(
+; CHECK:         .param .b64 retval0;
+; CHECK-NEXT:    prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_2;
+  %ret = call i64 @callee(ptr %p, i64 7)
+  ret i64 %ret
+}
+
+define %struct.64 @test_return_type_mismatch_variadic(ptr %p) {
+; CHECK-LABEL: test_return_type_mismatch_variadic(
+; CHECK:         .param .align 1 .b8 retval0[8];
+; CHECK-NEXT:    prototype_3 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_3;
+  %ret = call %struct.64 (ptr, ...) @callee_variadic(ptr %p)
+  ret %struct.64 %ret
+}
+
+define i64 @test_param_type_mismatch_variadic(ptr %p) {
+; CHECK-LABEL: test_param_type_mismatch_variadic(
+; CHECK:         .param .b64 retval0;
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    callee_variadic
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+  %ret = call i64 (ptr, ...) @callee_variadic(ptr %p, i64 7)
+  ret i64 %ret
+}
+
+define i64 @test_param_count_mismatch_variadic(ptr %p) {
+; CHECK-LABEL: test_param_count_mismatch_variadic(
+; CHECK:         .param .b64 retval0;
+; CHECK-NEXT:    call.uni (retval0),
+; CHECK-NEXT:    callee_variadic
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+  %ret = call i64 (ptr, ...) @callee_variadic(ptr %p, i64 7)
+  ret i64 %ret
+}

>From aaa1d280e5b31df56fa781d05c5ba67df2e12fa0 Mon Sep 17 00:00:00 2001
From: Kevin McAfee <kmcafee at nvidia.com>
Date: Wed, 11 Sep 2024 11:53:56 -0700
Subject: [PATCH 2/3] Refactor to use existing API instead of manual checks for
 types

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 30 +++++----------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2be9b76ba9fc96..fbbaff75d04570 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1657,31 +1657,15 @@ LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
   return RetVal;
 }
 
-static bool shouldConvertToIndirectCall(bool IsVarArg, unsigned ParamCount,
-                                        NVPTXTargetLowering::ArgListTy &Args,
-                                        const CallBase *CB,
-                                        GlobalAddressSDNode *Func) {
+static bool shouldConvertToIndirectCall(const CallBase *CB,
+                                        const GlobalAddressSDNode *Func) {
   if (!Func)
     return false;
   auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal());
   if (!CalleeFunc)
     return false;
 
-  auto ActualReturnType = CalleeFunc->getReturnType();
-  if (CB->getType() != ActualReturnType)
-    return true;
-
-  if (IsVarArg)
-    return false;
-
-  auto ActualNumParams = CalleeFunc->getFunctionType()->getNumParams();
-  if (ParamCount != ActualNumParams)
-    return true;
-  for (const Argument &I : CalleeFunc->args())
-    if (I.getType() != Args[I.getArgNo()].Ty)
-      return true;
-
-  return false;
+  return CB->getFunctionType() != CalleeFunc->getFunctionType();
 }
 
 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -1998,11 +1982,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                     VADeclareParam->getVTList(), DeclareParamOps);
   }
 
-  // If the param count, type of any param, or return type of the callsite
-  // mismatches with that of the function signature, convert the callsite to an
-  // indirect call.
-  bool ConvertToIndirectCall =
-      shouldConvertToIndirectCall(CLI.IsVarArg, ParamCount, Args, CB, Func);
+  // If the type of the callsite does not match that of the function, convert
+  // the callsite to an indirect call.
+  bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
 
   // Both indirect calls and libcalls have nullptr Func. In order to distinguish
   // between them we must rely on the call site value which is valid for

>From 6c4a734dfc21e5ef9a09379fb7c806a355023e25 Mon Sep 17 00:00:00 2001
From: Kevin McAfee <kmcafee at nvidia.com>
Date: Wed, 11 Sep 2024 11:54:18 -0700
Subject: [PATCH 3/3] Update test to account for indirect call conversion

---
 .../CodeGen/NVPTX/lower-args-gridconstant.ll  | 110 ++++++++++--------
 1 file changed, 61 insertions(+), 49 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 176dfee11cfb09..b203a78d677308 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -72,21 +72,24 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-LABEL: grid_const_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<3>;
-; PTX-NEXT:    .reg .b64 %rd<4>;
+; PTX-NEXT:    .reg .b64 %rd<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    mov.b64 %rd1, grid_const_escape_param_0;
-; PTX-NEXT:    mov.u64 %rd2, %rd1;
-; PTX-NEXT:    cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT:    mov.b64 %rd2, grid_const_escape_param_0;
+; PTX-NEXT:    mov.u64 %rd3, %rd2;
+; PTX-NEXT:    cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT:    mov.u64 %rd1, escape;
 ; PTX-NEXT:    { // callseq 0, 0
 ; PTX-NEXT:    .param .b64 param0;
-; PTX-NEXT:    st.param.b64 [param0+0], %rd3;
+; PTX-NEXT:    st.param.b64 [param0+0], %rd4;
 ; PTX-NEXT:    .param .b32 retval0;
-; PTX-NEXT:    call.uni (retval0),
-; PTX-NEXT:    escape,
+; PTX-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT:    call (retval0),
+; PTX-NEXT:    %rd1,
 ; PTX-NEXT:    (
 ; PTX-NEXT:    param0
-; PTX-NEXT:    );
+; PTX-NEXT:    )
+; PTX-NEXT:    , prototype_0;
 ; PTX-NEXT:    ld.param.b32 %r1, [retval0+0];
 ; PTX-NEXT:    } // callseq 0
 ; PTX-NEXT:    ret;
@@ -107,36 +110,39 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
 ; PTX-NEXT:    .reg .b32 %r<4>;
-; PTX-NEXT:    .reg .b64 %rd<9>;
+; PTX-NEXT:    .reg .b64 %rd<10>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.u64 %SPL, __local_depot3;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX-NEXT:    mov.b64 %rd1, multiple_grid_const_escape_param_0;
-; PTX-NEXT:    mov.b64 %rd2, multiple_grid_const_escape_param_2;
-; PTX-NEXT:    mov.u64 %rd3, %rd2;
+; PTX-NEXT:    mov.b64 %rd2, multiple_grid_const_escape_param_0;
+; PTX-NEXT:    mov.b64 %rd3, multiple_grid_const_escape_param_2;
+; PTX-NEXT:    mov.u64 %rd4, %rd3;
 ; PTX-NEXT:    ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
-; PTX-NEXT:    cvta.param.u64 %rd4, %rd3;
-; PTX-NEXT:    mov.u64 %rd5, %rd1;
-; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
-; PTX-NEXT:    add.u64 %rd7, %SP, 0;
-; PTX-NEXT:    add.u64 %rd8, %SPL, 0;
-; PTX-NEXT:    st.local.u32 [%rd8], %r1;
+; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT:    mov.u64 %rd6, %rd2;
+; PTX-NEXT:    cvta.param.u64 %rd7, %rd6;
+; PTX-NEXT:    add.u64 %rd8, %SP, 0;
+; PTX-NEXT:    add.u64 %rd9, %SPL, 0;
+; PTX-NEXT:    st.local.u32 [%rd9], %r1;
+; PTX-NEXT:    mov.u64 %rd1, escape3;
 ; PTX-NEXT:    { // callseq 1, 0
 ; PTX-NEXT:    .param .b64 param0;
-; PTX-NEXT:    st.param.b64 [param0+0], %rd6;
+; PTX-NEXT:    st.param.b64 [param0+0], %rd7;
 ; PTX-NEXT:    .param .b64 param1;
-; PTX-NEXT:    st.param.b64 [param1+0], %rd7;
+; PTX-NEXT:    st.param.b64 [param1+0], %rd8;
 ; PTX-NEXT:    .param .b64 param2;
-; PTX-NEXT:    st.param.b64 [param2+0], %rd4;
+; PTX-NEXT:    st.param.b64 [param2+0], %rd5;
 ; PTX-NEXT:    .param .b32 retval0;
-; PTX-NEXT:    call.uni (retval0),
-; PTX-NEXT:    escape3,
+; PTX-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
+; PTX-NEXT:    call (retval0),
+; PTX-NEXT:    %rd1,
 ; PTX-NEXT:    (
 ; PTX-NEXT:    param0,
 ; PTX-NEXT:    param1,
 ; PTX-NEXT:    param2
-; PTX-NEXT:    );
+; PTX-NEXT:    )
+; PTX-NEXT:    , prototype_1;
 ; PTX-NEXT:    ld.param.b32 %r2, [retval0+0];
 ; PTX-NEXT:    } // callseq 1
 ; PTX-NEXT:    ret;
@@ -221,26 +227,29 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<5>;
-; PTX-NEXT:    .reg .b64 %rd<6>;
+; PTX-NEXT:    .reg .b64 %rd<7>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    mov.b64 %rd1, grid_const_partial_escape_param_0;
-; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_partial_escape_param_1];
-; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
-; PTX-NEXT:    mov.u64 %rd4, %rd1;
-; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
-; PTX-NEXT:    ld.u32 %r1, [%rd5];
+; PTX-NEXT:    mov.b64 %rd2, grid_const_partial_escape_param_0;
+; PTX-NEXT:    ld.param.u64 %rd3, [grid_const_partial_escape_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; PTX-NEXT:    mov.u64 %rd5, %rd2;
+; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT:    ld.u32 %r1, [%rd6];
 ; PTX-NEXT:    add.s32 %r2, %r1, %r1;
-; PTX-NEXT:    st.global.u32 [%rd3], %r2;
+; PTX-NEXT:    st.global.u32 [%rd4], %r2;
+; PTX-NEXT:    mov.u64 %rd1, escape;
 ; PTX-NEXT:    { // callseq 2, 0
 ; PTX-NEXT:    .param .b64 param0;
-; PTX-NEXT:    st.param.b64 [param0+0], %rd5;
+; PTX-NEXT:    st.param.b64 [param0+0], %rd6;
 ; PTX-NEXT:    .param .b32 retval0;
-; PTX-NEXT:    call.uni (retval0),
-; PTX-NEXT:    escape,
+; PTX-NEXT:    prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT:    call (retval0),
+; PTX-NEXT:    %rd1,
 ; PTX-NEXT:    (
 ; PTX-NEXT:    param0
-; PTX-NEXT:    );
+; PTX-NEXT:    )
+; PTX-NEXT:    , prototype_2;
 ; PTX-NEXT:    ld.param.b32 %r3, [retval0+0];
 ; PTX-NEXT:    } // callseq 2
 ; PTX-NEXT:    ret;
@@ -266,27 +275,30 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; PTX-LABEL: grid_const_partial_escapemem(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<6>;
-; PTX-NEXT:    .reg .b64 %rd<6>;
+; PTX-NEXT:    .reg .b64 %rd<7>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    mov.b64 %rd1, grid_const_partial_escapemem_param_0;
-; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_partial_escapemem_param_1];
-; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
-; PTX-NEXT:    mov.u64 %rd4, %rd1;
-; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
-; PTX-NEXT:    ld.u32 %r1, [%rd5];
-; PTX-NEXT:    ld.u32 %r2, [%rd5+4];
-; PTX-NEXT:    st.global.u64 [%rd3], %rd5;
+; PTX-NEXT:    mov.b64 %rd2, grid_const_partial_escapemem_param_0;
+; PTX-NEXT:    ld.param.u64 %rd3, [grid_const_partial_escapemem_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; PTX-NEXT:    mov.u64 %rd5, %rd2;
+; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT:    ld.u32 %r1, [%rd6];
+; PTX-NEXT:    ld.u32 %r2, [%rd6+4];
+; PTX-NEXT:    st.global.u64 [%rd4], %rd6;
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
+; PTX-NEXT:    mov.u64 %rd1, escape;
 ; PTX-NEXT:    { // callseq 3, 0
 ; PTX-NEXT:    .param .b64 param0;
-; PTX-NEXT:    st.param.b64 [param0+0], %rd5;
+; PTX-NEXT:    st.param.b64 [param0+0], %rd6;
 ; PTX-NEXT:    .param .b32 retval0;
-; PTX-NEXT:    call.uni (retval0),
-; PTX-NEXT:    escape,
+; PTX-NEXT:    prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT:    call (retval0),
+; PTX-NEXT:    %rd1,
 ; PTX-NEXT:    (
 ; PTX-NEXT:    param0
-; PTX-NEXT:    );
+; PTX-NEXT:    )
+; PTX-NEXT:    , prototype_3;
 ; PTX-NEXT:    ld.param.b32 %r4, [retval0+0];
 ; PTX-NEXT:    } // callseq 3
 ; PTX-NEXT:    st.param.b32 [func_retval0+0], %r3;



More information about the llvm-commits mailing list