[llvm] 50c7504 - [NVPTX] Avoid temp copy of byval kernel parameters.

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 15 14:28:29 PDT 2021


Author: Artem Belevich
Date: 2021-03-15T14:27:22-07:00
New Revision: 50c7504a93fdb90c26870db8c8ea7add895c7725

URL: https://github.com/llvm/llvm-project/commit/50c7504a93fdb90c26870db8c8ea7add895c7725
DIFF: https://github.com/llvm/llvm-project/commit/50c7504a93fdb90c26870db8c8ea7add895c7725.diff

LOG: [NVPTX] Avoid temp copy of byval kernel parameters.

Avoid making a temporary copy of a byval argument if all accesses are loads,
and therefore the pointer to the parameter cannot escape.

This avoids excessive global memory accesses when each kernel makes its own
copy.

Differential revision: https://reviews.llvm.org/D98469

Added: 
    llvm/test/CodeGen/NVPTX/lower-byval-args.ll

Modified: 
    llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
    llvm/test/CodeGen/NVPTX/lower-args.ll
    llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index fd58ff13788d..56643f64e6c2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -140,6 +140,7 @@ INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
 
 // =============================================================================
 // If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
+// and we can't guarantee that the only accesses are loads,
 // then add the following instructions to the first basic block:
 //
 // %temp = alloca %struct.x, align 8
@@ -150,7 +151,57 @@ INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
 // The above code allocates some space in the stack and copies the incoming
 // struct from param space to local space.
 // Then replace all occurrences of %d by %temp.
+//
+// In case we know that all users are GEPs or Loads, replace them with the same
+// ones in parameter AS, so we can access them using ld.param.
 // =============================================================================
+
+// Replaces the \p OldUser instruction with an equivalent one in parameter AS.
+// Only Load and GEP are supported.
+static void convertToParamAS(Value *OldUser, Value *Param) {
+  Instruction *I = dyn_cast<Instruction>(OldUser);
+  assert(I && "OldUser must be an instruction");
+  struct IP {
+    Instruction *OldInstruction;
+    Value *NewParam;
+  };
+  SmallVector<IP> ItemsToConvert = {{I, Param}};
+  SmallVector<GetElementPtrInst *> GEPsToDelete;
+  while (!ItemsToConvert.empty()) {
+    IP I = ItemsToConvert.pop_back_val();
+    if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction))
+      LI->setOperand(0, I.NewParam);
+    else if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
+      SmallVector<Value *, 4> Indices(GEP->indices());
+      auto *NewGEP = GetElementPtrInst::Create(nullptr, I.NewParam, Indices,
+                                               GEP->getName(), GEP);
+      NewGEP->setIsInBounds(GEP->isInBounds());
+      llvm::for_each(GEP->users(), [NewGEP, &ItemsToConvert](Value *V) {
+        ItemsToConvert.push_back({cast<Instruction>(V), NewGEP});
+      });
+      GEPsToDelete.push_back(GEP);
+    } else
+      llvm_unreachable("Only Load and GEP can be converted to param AS.");
+  }
+  llvm::for_each(GEPsToDelete,
+                 [](GetElementPtrInst *GEP) { GEP->eraseFromParent(); });
+}
+
+static bool isALoadChain(Value *Start) {
+  SmallVector<Value *, 16> ValuesToCheck = {Start};
+  while (!ValuesToCheck.empty()) {
+    Value *V = ValuesToCheck.pop_back_val();
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return false;
+    if (isa<GetElementPtrInst>(I))
+      ValuesToCheck.append(I->user_begin(), I->user_end());
+    else if (!isa<LoadInst>(I))
+      return false;
+  }
+  return true;
+};
+
 void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   Function *Func = Arg->getParent();
   Instruction *FirstInst = &(Func->getEntryBlock().front());
@@ -159,6 +210,21 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   assert(PType && "Expecting pointer type in handleByValParam");
 
   Type *StructType = PType->getElementType();
+
+  if (llvm::all_of(Arg->users(), isALoadChain)) {
+    // Replace all loads with the loads in param AS. This allows loading the Arg
+    // directly from parameter AS, without making a temporary copy.
+    SmallVector<User *, 16> UsersToUpdate(Arg->users());
+    Value *ArgInParamAS = new AddrSpaceCastInst(
+        Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+        FirstInst);
+    llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) {
+      convertToParamAS(V, ArgInParamAS);
+    });
+    return;
+  }
+
+  // Otherwise we have to create a temporary copy.
   const DataLayout &DL = Func->getParent()->getDataLayout();
   unsigned AS = DL.getAllocaAddrSpace();
   AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);

diff  --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index fbd97b9bd684..078b3fea4c7a 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -23,5 +23,12 @@ entry:
   %arg.idx.val.val = load i32, i32* %arg.idx.val, align 4
   %add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
   store i32 %add.i, i32* %arg.idx1.val, align 4
+
+  ; Let the pointer escape so we still create a local copy, which this test
+  ; uses to check the load alignment.
+  %tmp = call i32* @escape(i32* nonnull %arg.idx2)
   ret void
 }
+
+; Function Attrs: convergent nounwind
+declare dso_local i32* @escape(i32*) local_unnamed_addr

diff  --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
new file mode 100644
index 000000000000..455eb37e5a17
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ham = type { [4 x i32] }
+
+; // Verify that load with static offset into parameter is done directly.
+; CHECK-LABEL: .visible .entry static_offset
+; CHECK: ld.param.u64    [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64         %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64         %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: ld.param.u32    [[value:%r[0-9]+]], [%[[param_addr1]]+12];
+; CHECK: st.global.u32   [[[result_addr_g]]], [[value]];
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp eq i32 %arg2, 3
+  br i1 %tmp, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb
+  %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
+  %tmp5 = load i32, i32* %tmp4, align 4
+  store i32 %tmp5, i32* %arg, align 4
+  br label %bb6
+
+bb6:                                              ; preds = %bb3, %bb
+  ret void
+}
+
+; // Verify that load with dynamic offset into parameter is also done directly.
+; CHECK-LABEL: .visible .entry dynamic_offset
+; CHECK: ld.param.u64    [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64         %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64         %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64         %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; CHECK: ld.param.u32    [[value:%r[0-9]+]], [%[[param_w_offset]]];
+; CHECK: st.global.u32   [[[result_addr_g]]], [[value]];
+
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  ret void
+}
+
+; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
+; CHECK-LABEL: .visible .entry pointer_escapes
+; CHECK: .local .align 8 .b8     __local_depot{{.*}}
+; CHECK: ld.param.u64    [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: add.u64         %[[copy_addr:rd[0-9]+]], %SPL, 0;
+; CHECK-DAG: ld.param.u32    %{{.*}}, [pointer_escapes_param_1+12];
+; CHECK-DAG: ld.param.u32    %{{.*}}, [pointer_escapes_param_1+8];
+; CHECK-DAG: ld.param.u32    %{{.*}}, [pointer_escapes_param_1+4];
+; CHECK-DAG: ld.param.u32    %{{.*}}, [pointer_escapes_param_1];
+; CHECK-DAG: st.local.u32    [%[[copy_addr]]+12],
+; CHECK-DAG: st.local.u32    [%[[copy_addr]]+8],
+; CHECK-DAG: st.local.u32    [%[[copy_addr]]+4],
+; CHECK-DAG: st.local.u32    [%[[copy_addr]]],
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64         %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
+; CHECK: ld.local.u32    [[value:%r[0-9]+]], [%[[copy_w_offset]]];
+; CHECK: st.global.u32   [[[result_addr_g]]], [[value]];
+
+; Function Attrs: convergent norecurse nounwind mustprogress
+define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  %tmp5 = call i32* @escape(i32* nonnull %tmp3) #3
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare dso_local i32* @escape(i32*) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
+!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
+!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}

diff  --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
index 3430f1eecadc..5632ddc83a0f 100644
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -35,7 +35,7 @@ define void @ptr_in_byval_kernel(%struct.S* byval(%struct.S) %input, i32* %outpu
 ; CHECK: ld.param.u64 	%[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
 ; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
-  %b = load i32*, i32** %b_ptr, align 4
+  %b = load i32*, i32** %b_ptr, align 8
   %v = load i32, i32* %b, align 4
 ; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
   store i32 %v, i32* %output, align 4
@@ -51,7 +51,7 @@ define void @ptr_in_byval_func(%struct.S* byval(%struct.S) %input, i32* %output)
 ; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_func_param_1]
 ; CHECK: ld.param.u64 	%[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
-  %b = load i32*, i32** %b_ptr, align 4
+  %b = load i32*, i32** %b_ptr, align 8
   %v = load i32, i32* %b, align 4
 ; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
   store i32 %v, i32* %output, align 4


        


More information about the llvm-commits mailing list