[llvm] r273313 - [NVPTX] Improve lowering of byval args of device functions.

Tue Jun 21 13:30:26 PDT 2016

Author: tra
Date: Tue Jun 21 15:30:26 2016
New Revision: 273313

URL: http://llvm.org/viewvc/llvm-project?rev=273313&view=rev
Log:
[NVPTX] Improve lowering of byval args of device functions.

Avoid unnecessary spills of such vars to local space on SASS level and
pointer space conversion.

Instead, make a local copy with appropriate addrspacecasts and let
LLVM optimize them away when possible.

This allows loading value of the argument using [symbol+offset]
instead of converting argument to general space pointer and using it
for indexing (which also implicitly converts param space pointer to
local space one on SASS level and triggers copying of argument into
local space in the process).

This reduces call overhead, uses less registers and reduces overall
SASS size by 2-4%.

Differential Review: http://reviews.llvm.org/D21421

Modified:
    llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
    llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
    llvm/trunk/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
    llvm/trunk/test/CodeGen/NVPTX/bug21465.ll
    llvm/trunk/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll

Modified: llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp?rev=273313&r1=273312&r2=273313&view=diff
==============================================================================

--- llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp Tue Jun 21 15:30:26 2016
@@ -662,6 +662,58 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceC
           TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
       break;
     case ADDRESS_SPACE_PARAM:
+      if (Src.getOpcode() == NVPTXISD::MoveParam) {
+        // TL;DR: addrspacecast MoveParam to param space is a no-op.
+        //
+        // Longer version is that this particular change is both an
+        // optimization and a work-around for a problem in ptxas for
+        // sm_50+.
+        //
+        // Background:
+        // According to PTX ISA v7.5 5.1.6.4: "...whenever a formal
+        // parameter has its address taken within the called function
+        // [..] the parameter will be copied to the stack if
+        // necessary, and so the address will be in the .local state
+        // space and is accessed via ld.local and st.local
+        // instructions."
+        //
+        // Bug part: 'copied to the stack if necessary' part is
+        // currently broken for byval arguments with alignment < 4 for
+        // sm_50+ in all public versions of CUDA. Taking the address of
+        // such an argument results in SASS code that attempts to store
+        // the value using a 32-bit write to an unaligned address.
+        //
+        // Optimization: On top of the overhead of spill-to-stack, we
+        // also convert that address from local to generic space,
+        // which may result in further overhead. All of it is in most
+        // cases unnecessary, as we only need the value of the
+        // variable, and it can be accessed directly by using the
+        // argument symbol. We'll use the address of the local copy if
+        // it's needed (see step (a) below).
+        //
+        // In order for this bit to do its job we need to:
+        //
+        // a) Let LLVM do the spilling and make sure the IR does not
+        // access byval arguments directly. Instead, the argument
+        // pointer (which is in the default address space) is
+        // addrspacecast'ed to param space, and the data it points to
+        // is copied to allocated local space (i.e. we let compiler
+        // spill it, which gives us an opportunity to optimize the
+        // spill away later if nobody needs its address). Then all
+        // uses of arg pointer are replaced with a pointer to local
+        // copy of the arg. All this is done in NVPTXLowerKernelArgs.
+        //
+        // b) LowerFormalArguments() lowers the argument to
+        // NVPTXISD::MoveParam without any space conversion.
+        //
+        // c) And the final step is done by the code below.
+        // It replaces the addrspacecast (MoveParam) from step (a)
+        // with the arg symbol itself. This can then be used for
+        // [symbol + offset] addressing.
+
+        ReplaceNode(N, Src.getOperand(0).getNode());
+        return;
+      }
       Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                          : NVPTX::nvvm_ptr_gen_to_param;
       break;

Modified: llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp?rev=273313&r1=273312&r2=273313&view=diff
==============================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp Tue Jun 21 15:30:26 2016
@@ -2078,7 +2078,6 @@ SDValue NVPTXTargetLowering::LowerFormal
   SDValue Root = DAG.getRoot();
   std::vector<SDValue> OutChains;
 
-  bool isKernel = llvm::isKernelFunction(*F);
   bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
@@ -2112,7 +2111,8 @@ SDValue NVPTXTargetLowering::LowerFormal
             theArgs[i],
             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                      : nullptr))) {
-      assert(isKernel && "Only kernels can have image/sampler params");
+      assert(llvm::isKernelFunction(*F) &&
+             "Only kernels can have image/sampler params");
       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
       continue;
     }
@@ -2334,6 +2334,11 @@ SDValue NVPTXTargetLowering::LowerFormal
     // machine instruction fails because TargetExternalSymbol
     // (not lowered) is target dependent, and CopyToReg assumes
     // the source is lowered.
+    //
+    // Byval arguments for regular function are lowered the same way
+    // as for kernels in order to generate better code and work around
+    // a known issue in ptxas. See comments in
+    // NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the gory details.
     EVT ObjectVT = getValueType(DL, Ty);
     assert(ObjectVT == Ins[InsIdx].VT &&
            "Ins type did not match function type");
@@ -2341,14 +2346,7 @@ SDValue NVPTXTargetLowering::LowerFormal
     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
     if (p.getNode())
       p.getNode()->setIROrder(idx + 1);
-    if (isKernel)
-      InVals.push_back(p);
-    else {
-      SDValue p2 = DAG.getNode(
-          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
-          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
-      InVals.push_back(p2);
-    }
+    InVals.push_back(p);
   }
 
   // Clang will check explicit VarArg and issue error if any. However, Clang

Modified: llvm/trunk/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp?rev=273313&r1=273312&r2=273313&view=diff
==============================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp (original)
+++ llvm/trunk/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp Tue Jun 21 15:30:26 2016
@@ -101,6 +101,11 @@ namespace {
 class NVPTXLowerKernelArgs : public FunctionPass {
   bool runOnFunction(Function &F) override;
 
+  // Kernels and regular device functions treat byval arguments
+  // differently.
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
   // handle byval parameters
   void handleByValParam(Argument *Arg);
   // Knowing Ptr must point to the global address space, this function
@@ -192,11 +197,7 @@ void NVPTXLowerKernelArgs::markPointerAs
 // =============================================================================
 // Main function for this pass.
 // =============================================================================
-bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
-  // Skip non-kernels. See the comments at the top of this file.
-  if (!isKernelFunction(F))
-    return false;
-
+bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
   if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
     // Mark pointers in byval structs as global.
     for (auto &B : F) {
@@ -228,6 +229,19 @@ bool NVPTXLowerKernelArgs::runOnFunction
   return true;
 }
 
+// See comments in NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the
+// details on why we need to spill byval arguments into local memory.
+bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) {
+  for (Argument &Arg : F.args())
+    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
+      handleByValParam(&Arg);
+  return true;
+}
+
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
 FunctionPass *
 llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
   return new NVPTXLowerKernelArgs(TM);

Modified: llvm/trunk/test/CodeGen/NVPTX/bug21465.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/NVPTX/bug21465.ll?rev=273313&r1=273312&r2=273313&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/NVPTX/bug21465.ll (original)
+++ llvm/trunk/test/CodeGen/NVPTX/bug21465.ll Tue Jun 21 15:30:26 2016
@@ -15,7 +15,7 @@ entry:
   %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %0 = load i32, i32* %b, align 4
 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
-; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4]
+; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
   store i32 %0, i32* %output, align 4
 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
   ret void

Modified: llvm/trunk/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll?rev=273313&r1=273312&r2=273313&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll (original)
+++ llvm/trunk/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll Tue Jun 21 15:30:26 2016
@@ -28,20 +28,38 @@ define void @kernel2(float addrspace(1)*
 
 %struct.S = type { i32*, i32* }
 
-define void @ptr_in_byval(%struct.S* byval %input, i32* %output) {
-; CHECK-LABEL: .visible .entry ptr_in_byval(
-; CHECK: cvta.to.global.u64
-; CHECK: cvta.to.global.u64
+define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
+; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
+; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
+; CHECK: ld.param.u64 	%[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
+; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %b = load i32*, i32** %b_ptr, align 4
   %v = load i32, i32* %b, align 4
-; CHECK: ld.global.u32
+; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
   store i32 %v, i32* %output, align 4
-; CHECK: st.global.u32
+; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
+  ret void
+}
+
+; Regular functions lower byval arguments differently. We need to make
+; sure that we're loading byval argument data using [symbol+offset].
+; There's also no assumption that all pointers within are in global space.
+define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .func ptr_in_byval_func(
+; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_func_param_1]
+; CHECK: ld.param.u64 	%[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
+  %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
+  %b = load i32*, i32** %b_ptr, align 4
+  %v = load i32, i32* %b, align 4
+; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
+  store i32 %v, i32* %output, align 4
+; CHECK: st.u32 [%[[optr]]], %[[val]]
   ret void
 }
 
 !nvvm.annotations = !{!0, !1, !2}
 !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
 !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
-!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}
+!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}