<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - Poor performance of OpenMP distribute construct"
   href="https://bugs.llvm.org/show_bug.cgi?id=43998">43998</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Poor performance of OpenMP distribute construct
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>OpenMP
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>unspecified
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Clang Compiler Support
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedclangbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>csdaley@lbl.gov
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>The OpenMP distribute construct performs significantly worse than manually
dividing loop iterations between thread teams. Please see the test program
below which shows the performance of both methods on a system with Intel
Skylake CPUs and NVIDIA V100 GPUs. The performance difference is ~700x. I am
using LLVM/Clang from Nov 11 2019, although there is the same issue when using
LLVM/Clang from Aug 28 2019.

$ make
clang++ -std=c++11 -Ofast -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -o
test.exe test.cpp

$ srun -n 1 ./test.exe 
Number of sites = 1048576
Executing 100 iterations
Time w/distribute = 2.087 seconds
Time workaround   = 0.003 seconds

$ cat test.cpp
#include &lt;stdio.h&gt;
#include &lt;stdlib.h&gt;
#include &lt;omp.h&gt;

#include &lt;chrono&gt;
typedef std::chrono::system_clock Clock;

#define ITERATIONS 100
#define TOTAL_SITES 1048576

int main(int argc, char *argv[])
{
  int total_sites = TOTAL_SITES;
  printf("Number of sites = %d\n", total_sites);
  printf("Executing %d iterations\n", ITERATIONS);

  auto tstart = Clock::now();
  for (int iters=0; iters&lt;ITERATIONS; ++iters) {
    #pragma omp target teams distribute
    for(int i=0; i&lt;total_sites; ++i) {
      ;
    }
  }
  double sec =
std::chrono::duration_cast&lt;std::chrono::microseconds&gt;(Clock::now()-tstart).count()
/ 1.0E6;
  printf("Time w/distribute = %.3f seconds\n", sec);

  tstart = Clock::now();
  for (int iters=0; iters&lt;ITERATIONS; ++iters) {
    #pragma omp target teams
    {
      int total_teams = omp_get_num_teams();
      int team_id = omp_get_team_num();
      int sites_per_team = (total_sites + total_teams - 1) / total_teams;
      int istart = team_id * sites_per_team;
      if (istart > total_sites) istart = total_sites;
      int iend = istart + sites_per_team;
      if (iend > total_sites) iend = total_sites;

      /* This is the total_sites loop manually chopped up */
      for (int i = istart; i &lt; iend; ++i) {
        ;
      }
    }
  }
  sec =
std::chrono::duration_cast&lt;std::chrono::microseconds&gt;(Clock::now()-tstart).count()
/ 1.0E6;
  printf("Time workaround   = %.3f seconds\n", sec);
}


The performance of the distribute construct can be improved by reducing the
number of teams using the num_teams clause. However, the performance is never
competitive compared to manually dividing loop iterations between thread teams.

Thanks,
Chris</pre>
        </div>
      </p>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>