[llvm-branch-commits] [clang] ad20866 - [OpenMP][NVPTX] Take functions in `deviceRTLs` as `convergent`

Wed Feb 3 19:35:36 PST 2021

Author: Shilei Tian
Date: 2021-02-03T19:35:04-08:00
New Revision: ad2086658df181369a09ad69dac260a41dbab814

URL: https://github.com/llvm/llvm-project/commit/ad2086658df181369a09ad69dac260a41dbab814
DIFF: https://github.com/llvm/llvm-project/commit/ad2086658df181369a09ad69dac260a41dbab814.diff

LOG: [OpenMP][NVPTX] Take functions in `deviceRTLs` as `convergent`

The OpenMP device compiler (similar to other SPMD compilers) assumes that
functions are convergent by default to avoid invalid transformations, such as
the one reported in this bug (https://bugs.llvm.org/show_bug.cgi?id=49021).

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95971

(cherry picked from commit 0f0ce3c12edefd25448e39c4d20718a10d3d42c1)

Added: 
    clang/test/OpenMP/target_attribute_convergent.cpp
    openmp/libomptarget/test/offloading/bug49021.cpp

Modified: 
    clang/lib/Frontend/CompilerInvocation.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index d8be4ea14868..036388ebd355 100644

--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -2470,6 +2470,8 @@ void CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
   bool IsTargetSpecified =
       Opts.OpenMPIsDevice || Args.hasArg(options::OPT_fopenmp_targets_EQ);
 
+  Opts.ConvergentFunctions = Opts.ConvergentFunctions || Opts.OpenMPIsDevice;
+
   if (Opts.OpenMP || Opts.OpenMPSimd) {
     if (int Version = getLastArgIntValue(
             Args, OPT_fopenmp_version_EQ,

diff  --git a/clang/test/OpenMP/target_attribute_convergent.cpp b/clang/test/OpenMP/target_attribute_convergent.cpp
new file mode 100644
index 000000000000..932214e987c8
--- /dev/null
+++ b/clang/test/OpenMP/target_attribute_convergent.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -o - | FileCheck %s
+// expected-no-diagnostics
+
+#pragma omp declare target
+
+void foo() {}
+
+#pragma omp end declare target
+
+// CHECK: Function Attrs: {{.*}}convergent{{.*}}
+// CHECK: define hidden void @_Z3foov() [[ATTRIBUTE_NUMBER:#[0-9]+]]
+// CHECK: attributes [[ATTRIBUTE_NUMBER]] = { {{.*}}convergent{{.*}} }

diff  --git a/openmp/libomptarget/test/offloading/bug49021.cpp b/openmp/libomptarget/test/offloading/bug49021.cpp
new file mode 100644
index 000000000000..bcdbf68b10e0
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/bug49021.cpp
@@ -0,0 +1,85 @@
+// RUN: %libomptarget-compilexx-aarch64-unknown-linux-gnu -O3 && %libomptarget-run-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-powerpc64-ibm-linux-gnu -O3 && %libomptarget-run-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-powerpc64le-ibm-linux-gnu -O3 && %libomptarget-run-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-x86_64-pc-linux-gnu -O3 && %libomptarget-run-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compilexx-nvptx64-nvidia-cuda -O3 && %libomptarget-run-nvptx64-nvidia-cuda
+
+#include <iostream>
+
+template <typename T> int test_map() {
+  std::cout << "map(complex<>)" << std::endl;
+  T a(0.2), a_check;
+#pragma omp target map(from : a_check)
+  { a_check = a; }
+
+  if (a_check != a) {
+    std::cout << " wrong results";
+    return 1;
+  }
+
+  return 0;
+}
+
+template <typename T> int test_reduction() {
+  std::cout << "flat parallelism" << std::endl;
+  T sum(0), sum_host(0);
+  const int size = 100;
+  T array[size];
+  for (int i = 0; i < size; i++) {
+    array[i] = i;
+    sum_host += array[i];
+  }
+
+#pragma omp target teams distribute parallel for map(to: array[:size])         \
+                                                 reduction(+ : sum)
+  for (int i = 0; i < size; i++)
+    sum += array[i];
+
+  if (sum != sum_host)
+    std::cout << " wrong results " << sum << " host " << sum_host << std::endl;
+
+  std::cout << "hierarchical parallelism" << std::endl;
+  const int nblock(10), block_size(10);
+  T block_sum[nblock];
+#pragma omp target teams distribute map(to                                     \
+                                        : array[:size])                        \
+    map(from                                                                   \
+        : block_sum[:nblock])
+  for (int ib = 0; ib < nblock; ib++) {
+    T partial_sum = 0;
+    const int istart = ib * block_size;
+    const int iend = (ib + 1) * block_size;
+#pragma omp parallel for reduction(+ : partial_sum)
+    for (int i = istart; i < iend; i++)
+      partial_sum += array[i];
+    block_sum[ib] = partial_sum;
+  }
+
+  sum = 0;
+  for (int ib = 0; ib < nblock; ib++) {
+    sum += block_sum[ib];
+  }
+
+  if (sum != sum_host) {
+    std::cout << " wrong results " << sum << " host " << sum_host << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+template <typename T> int test_complex() {
+  int ret = 0;
+  ret |= test_map<T>();
+  ret |= test_reduction<T>();
+  return ret;
+}
+
+int main() {
+  int ret = 0;
+  std::cout << "Testing float" << std::endl;
+  ret |= test_complex<float>();
+  std::cout << "Testing double" << std::endl;
+  ret |= test_complex<double>();
+  return ret;
+}