[polly] r311161 - [GPGPU] Simplify PPCGSCop to reduce compile time [NFC]

Tobias Grosser via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 18 06:38:12 PDT 2017


Author: grosser
Date: Fri Aug 18 06:38:12 2017
New Revision: 311161

URL: http://llvm.org/viewvc/llvm-project?rev=311161&view=rev
Log:
[GPGPU] Simplify PPCGSCop to reduce compile time [NFC]

Summary:
Drop unused parameter dimensions to reduce the size of the sets we are working
with. Especially the computed dependences tend to accumulate a lot of parameters
that are present in the input memory accesses, but often not necessary to
express the actual dependences. As isl represents maps and sets with dense
matrices, reducing the dimensionality of isl sets commonly reduces code
generation performance.

This reduces compile time from 17 to 11 seconds for our test case. While this is
not impressive, this patch helped me to identify the previous two performance
improvements and additionally also increases readability of the isl data
structures we use.

Reviewers: Meinersbur, bollu, singam-sanjay

Reviewed By: bollu

Subscribers: nemanjai, pollydev, llvm-commits, kbarton

Tags: #polly

Differential Revision: https://reviews.llvm.org/D36869

Modified:
    polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
    polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll
    polly/trunk/test/GPGPU/non-zero-array-offset.ll

Modified: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp?rev=311161&r1=311160&r2=311161&view=diff
==============================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp (original)
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp Fri Aug 18 06:38:12 2017
@@ -2632,6 +2632,77 @@ public:
     return Names;
   }
 
+  /// Remove unreferenced parameter dimensions from union_map.
+  isl::union_map removeUnusedParameters(isl::union_map UMap) {
+    auto New = isl::union_map::empty(isl::space(UMap.get_ctx(), 0, 0));
+
+    auto RemoveUnusedDims = [&New](isl::map S) -> isl::stat {
+      int Removed = 0;
+      int NumDims = S.dim(isl::dim::param);
+      for (long i = 0; i < NumDims; i++) {
+        const int Dim = i - Removed;
+        if (!S.involves_dims(isl::dim::param, Dim, 1)) {
+          S = S.remove_dims(isl::dim::param, Dim, 1);
+          Removed++;
+        }
+      }
+      New = New.unite(S);
+      return isl::stat::ok;
+    };
+
+    UMap.foreach_map(RemoveUnusedDims);
+    return New;
+  }
+
+  /// Remove unreferenced parameter dimensions from union_set.
+  isl::union_set removeUnusedParameters(isl::union_set USet) {
+    auto New = isl::union_set::empty(isl::space(USet.get_ctx(), 0, 0));
+
+    auto RemoveUnusedDims = [&New](isl::set S) -> isl::stat {
+      int Removed = 0;
+      int NumDims = S.dim(isl::dim::param);
+      for (long i = 0; i < NumDims; i++) {
+        const int Dim = i - Removed;
+        if (!S.involves_dims(isl::dim::param, Dim, 1)) {
+          S = S.remove_dims(isl::dim::param, Dim, 1);
+          Removed++;
+        }
+      }
+      New = New.unite(S);
+      return isl::stat::ok;
+    };
+
+    USet.foreach_set(RemoveUnusedDims);
+    return New;
+  }
+
+  /// Simplify PPCG scop to improve compile time.
+  ///
+  /// We drop unused parameter dimensions to reduce the size of the sets we are
+  /// working with. Especially the computed dependences tend to accumulate a lot
+  /// of parameters that are present in the input memory accesses, but often are
+  /// not necessary to express the actual dependences. As isl represents maps
+  /// and sets with dense matrices, reducing the dimensionality of isl sets
+  /// commonly reduces code generation performance.
+  void simplifyPPCGScop(ppcg_scop *PPCGScop) {
+    PPCGScop->domain =
+        removeUnusedParameters(isl::manage(PPCGScop->domain)).release();
+
+    PPCGScop->dep_forced =
+        removeUnusedParameters(isl::manage(PPCGScop->dep_forced)).release();
+    PPCGScop->dep_false =
+        removeUnusedParameters(isl::manage(PPCGScop->dep_false)).release();
+    PPCGScop->dep_flow =
+        removeUnusedParameters(isl::manage(PPCGScop->dep_flow)).release();
+    PPCGScop->tagged_dep_flow =
+        removeUnusedParameters(isl::manage(PPCGScop->tagged_dep_flow))
+            .release();
+
+    PPCGScop->tagged_dep_order =
+        removeUnusedParameters(isl::manage(PPCGScop->tagged_dep_order))
+            .release();
+  }
+
   /// Create a new PPCG scop from the current scop.
   ///
   /// The PPCG scop is initialized with data from the current polly::Scop. From
@@ -2689,6 +2760,7 @@ public:
     compute_tagger(PPCGScop);
     compute_dependences(PPCGScop);
     eliminate_dead_code(PPCGScop);
+    simplifyPPCGScop(PPCGScop);
 
     return PPCGScop;
   }
@@ -3130,10 +3202,14 @@ public:
 
     isl_schedule *Schedule = get_schedule(PPCGGen);
 
-    int has_permutable = has_any_permutable_node(Schedule);
+    /// Copy to and from device functions may introduce new parameters, which
+    /// must be present in the schedule tree root for code generation. Hence,
+    /// we ensure that all possible parameters are introduced from this point.
+    if (!PollyManagedMemory)
+      Schedule =
+          isl_schedule_align_params(Schedule, S->getFullParamSpace().release());
 
-    Schedule =
-        isl_schedule_align_params(Schedule, S->getFullParamSpace().release());
+    int has_permutable = has_any_permutable_node(Schedule);
 
     if (!has_permutable || has_permutable < 0) {
       Schedule = isl_schedule_free(Schedule);

Modified: polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll?rev=311161&r1=311160&r2=311161&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll (original)
+++ polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll Fri Aug 18 06:38:12 2017
@@ -21,7 +21,7 @@
 ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
 
-; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B)
+; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A)
 ; KERNEL-NEXT:   entry:
 ; KERNEL-NEXT:     %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
@@ -36,7 +36,7 @@
 ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
 
-; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A)
+; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B)
 ; KERNEL-NEXT:   entry:
 ; KERNEL-NEXT:     %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
@@ -47,13 +47,13 @@
 ; KERNEL-NEXT: }
 
 
-; IR:       [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
+; IR:       [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
 ; IR-NEXT:  [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; IR-NEXT:  store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
 ; IR-NEXT:  [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
 ; IR-NEXT:  store i8* [[DATA]], i8** [[SLOT]]
 
-; IR:       [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
+; IR:       [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
 ; IR-NEXT:  [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
 ; IR-NEXT:  store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
 ; IR-NEXT:  [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*

Modified: polly/trunk/test/GPGPU/non-zero-array-offset.ll
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/test/GPGPU/non-zero-array-offset.ll?rev=311161&r1=311160&r2=311161&view=diff
==============================================================================
--- polly/trunk/test/GPGPU/non-zero-array-offset.ll (original)
+++ polly/trunk/test/GPGPU/non-zero-array-offset.ll Fri Aug 18 06:38:12 2017
@@ -12,14 +12,14 @@
 
 ; CODE:          dim3 k0_dimBlock(8);
 ; CODE-NEXT:     dim3 k0_dimGrid(1);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
+; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B);
 ; CODE-NEXT:     cudaCheckKernel();
 ; CODE-NEXT:   }
 
 ; CODE:        {
 ; CODE-NEXT:     dim3 k1_dimBlock(8);
 ; CODE-NEXT:     dim3 k1_dimGrid(1);
-; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_B);
+; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
 ; CODE-NEXT:     cudaCheckKernel();
 ; CODE-NEXT:   }
 
@@ -27,10 +27,10 @@
 ; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
 
 ; CODE: # kernel0
-; CODE-NEXT: Stmt_bb11(t0);
+; CODE-NEXT: Stmt_bb3(t0);
 
 ; CODE: # kernel1
-; CODE-NEXT: Stmt_bb3(t0);
+; CODE-NEXT: Stmt_bb11(t0);
 
 ; IR:       %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32)
 ; IR-NEXT:  %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)




More information about the llvm-commits mailing list