<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Poor performance of OpenMP distribute construct"

   href="https://bugs.llvm.org/show_bug.cgi?id=43998">43998</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Poor performance of OpenMP distribute construct

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>OpenMP

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>unspecified

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Clang Compiler Support

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedclangbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>csdaley@lbl.gov

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>The OpenMP distribute construct performs significantly worse than manually

dividing loop iterations between thread teams. Please see the test program

below which shows the performance of both methods on a system with Intel

Skylake CPUs and NVIDIA V100 GPUs. The performance difference is ~700x. I am

using LLVM/Clang from Nov 11 2019, although there is the same issue when using

LLVM/Clang from Aug 28 2019.

$ make

clang++ -std=c++11 -Ofast -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -o

test.exe test.cpp

$ srun -n 1 ./test.exe 

Number of sites = 1048576

Executing 100 iterations

Time w/distribute = 2.087 seconds

Time workaround   = 0.003 seconds

$ cat test.cpp

#include &lt;stdio.h&gt;

#include &lt;stdlib.h&gt;

#include &lt;omp.h&gt;

#include &lt;chrono&gt;

typedef std::chrono::system_clock Clock;

#define ITERATIONS 100

#define TOTAL_SITES 1048576

int main(int argc, char *argv[])

{

  int total_sites = TOTAL_SITES;

  printf("Number of sites = %d\n", total_sites);

  printf("Executing %d iterations\n", ITERATIONS);

  auto tstart = Clock::now();

  for (int iters=0; iters&lt;ITERATIONS; ++iters) {

    #pragma omp target teams distribute

    for(int i=0; i&lt;total_sites; ++i) {

      ;

    }

  }

  double sec =

std::chrono::duration_cast&lt;std::chrono::microseconds&gt;(Clock::now()-tstart).count()

/ 1.0E6;

  printf("Time w/distribute = %.3f seconds\n", sec);

  tstart = Clock::now();

  for (int iters=0; iters&lt;ITERATIONS; ++iters) {

    #pragma omp target teams

    {

      int total_teams = omp_get_num_teams();

      int team_id = omp_get_team_num();

      int sites_per_team = (total_sites + total_teams - 1) / total_teams;

      int istart = team_id * sites_per_team;

      if (istart > total_sites) istart = total_sites;

      int iend = istart + sites_per_team;

      if (iend > total_sites) iend = total_sites;

      /* This is the total_sites loop manually chopped up */

      for (int i = istart; i &lt; iend; ++i) {

        ;

      }

    }

  }

  sec =

std::chrono::duration_cast&lt;std::chrono::microseconds&gt;(Clock::now()-tstart).count()

/ 1.0E6;

  printf("Time workaround   = %.3f seconds\n", sec);

}

The performance of the distribute construct can be improved by reducing the

number of teams using the num_teams clause. However, the performance is never

competitive compared to manually dividing loop iterations between thread teams.

Thanks,

Chris</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>