[Openmp-dev] [libomptarget] Data corruption on target nowait with more than 4 threads

Guilherme Valarini via Openmp-dev openmp-dev at lists.llvm.org
Fri Feb 12 09:39:43 PST 2021


Hello everyone,

I'm having some data corruption issues when using the generic-elf plugin on
the program below (blocked matrix multiplication). I tried to use 3 builds
to test this program: the release branches "release/11.x" and
"release/12.x", and the main branch as well. I observed the following
behavior:

- release/11.x & main: the program works correctly with up to 4 OpenMP
threads (OMP_NUM_THREADS=4), but with any higher number the result of the
operation becomes incorrect. I believe the problem may also happen with
2-4 threads, just with a lower likelihood (none of 500 executions showed
the problem);
- release/12.x: the program crashes due to a segfault inside a function
called "__kmp_push_task" from OpenMP runtime regardless of the number of
threads.

The program was compiled with the following command after setting the
environment variables to point to the correct clang build:

"clang++ -fopenmp -fopenmp-targets=x86_64-pc-linux-gnu BlockMatMul.cpp"

Does anyone know if this is an already known problem (e.g. multiple
parallel mappings happening at the same time)? What about the segfault in
"__kmp_push_task"?

Thanks for the help,
Guilherme Valarini

Here is the program (sorry, I could not come up with a smaller example to
post here). I have dumped the task graph built by OpenMP in dot/graphviz
form and it seems to be correct, with the intended dependencies declared in
the function "BlockMatMul_TargetNowait":

#include <assert.h>
> #include <math.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <vector>
> #include <sys/time.h>
> #include <time.h>
> #include <unistd.h>
// A dense matrix split into fixed-size blocks, each stored in its own
// contiguous buffer of rowsPerBlock * colsPerBlock floats.
//
// Block (i, j) holds, at offset ii + jj * colsPerBlock, the element found at
// flat index (i * colsPerBlock + ii) + (j * rowsPerBlock + jj) * nCols of the
// 2D array passed to Initialize()/Compare().
//
// Every block is owned by a std::vector<float>, so the storage is released
// when the BlockMatrix is destroyed and a copy of the object gets its own
// buffers instead of aliasing raw pointers.
class BlockMatrix {
private:
  const int rowsPerBlock;
  const int colsPerBlock;
  const long nRows;
  const long nCols;
  const int nBlocksPerRow; // nRows / rowsPerBlock: bound of the second index
  const int nBlocksPerCol; // nCols / colsPerBlock: bound of the first index
  // Blocks[i][j] is the zero-initialised buffer of block (i, j).
  std::vector<std::vector<std::vector<float>>> Blocks;

  // Number of whole blocks of size blockExtent that fit into extent. The
  // check runs before the division so a non-positive block size is reported
  // instead of dividing by zero in the member initialisers.
  static int CountBlocks(long extent, int blockExtent) {
    assert(blockExtent > 0 && "Block dimensions must be positive");
    return static_cast<int>(extent / blockExtent);
  }

public:
  // Allocates all blocks, zero-filled. A trailing partial block (when _nRows
  // or _nCols is not a multiple of the block size) is not represented.
  BlockMatrix(const int _rowsPerBlock, const int _colsPerBlock,
              const long _nRows, const long _nCols)
      : rowsPerBlock(_rowsPerBlock), colsPerBlock(_colsPerBlock),
        nRows(_nRows), nCols(_nCols),
        nBlocksPerRow(CountBlocks(_nRows, _rowsPerBlock)),
        nBlocksPerCol(CountBlocks(_nCols, _colsPerBlock)) {
    // Widen before multiplying so large block sizes cannot overflow int.
    const std::size_t blockSize =
        static_cast<std::size_t>(rowsPerBlock) * colsPerBlock;
    Blocks.assign(nBlocksPerCol,
                  std::vector<std::vector<float>>(
                      nBlocksPerRow, std::vector<float>(blockSize, 0.0f)));
  }

  // Copies the contents of a flat 2D array into the blocks.
  void Initialize(const float *matrix) {
    for (int i = 0; i < nBlocksPerCol; i++)
      for (int j = 0; j < nBlocksPerRow; j++) {
        float *CurrBlock = GetBlock(i, j);
        for (int ii = 0; ii < colsPerBlock; ++ii)
          for (int jj = 0; jj < rowsPerBlock; ++jj) {
            const long curri = static_cast<long>(i) * colsPerBlock + ii;
            const long currj = static_cast<long>(j) * rowsPerBlock + jj;
            CurrBlock[ii + jj * colsPerBlock] = matrix[curri + currj * nCols];
          }
      }
  }

  // Compares every blocked element against the flat 2D array using exact
  // float equality, prints each mismatch and a final summary to stdout, and
  // returns the number of mismatching elements.
  long Compare(const float *matrix) {
    long fail = 0;
    for (int i = 0; i < nBlocksPerCol; i++)
      for (int j = 0; j < nBlocksPerRow; j++) {
        const float *CurrBlock = GetBlock(i, j);
        for (int ii = 0; ii < colsPerBlock; ++ii)
          for (int jj = 0; jj < rowsPerBlock; ++jj) {
            const int curri = i * colsPerBlock + ii;
            const int currj = j * rowsPerBlock + jj;
            const float m_value =
                matrix[curri + static_cast<long>(currj) * nCols];
            const float bm_value = CurrBlock[ii + jj * colsPerBlock];
            if (bm_value != m_value) {
              fprintf(stdout, "i,j = %d,%d\n", i, j);
              fprintf(stdout, "BlockMAT[%d][%d] = %f\n", ii, jj, bm_value);
              fprintf(stdout, "MAT[%d][%d] = %f\n", curri, currj, m_value);
              fail++;
            }
          }
      }
    // Print results
    printf("Non-Matching Block Outputs: %ld\n", fail);
    return fail;
  }

  // Returns a pointer to the first element of block (i, j). The pointer stays
  // valid for the lifetime of this object; the buffers are never resized.
  float *GetBlock(int i, int j) {
    assert(i >= 0 && i < nBlocksPerCol && j >= 0 && j < nBlocksPerRow &&
           "Accessing outside block");
    return Blocks[i][j].data();
  }
};
>
> #define BS 256
> #define N 1024
>
> // Initialize matrices.
> void init(float *a, float *b) {
>   int i, j;
>   for (i = 0; i < N; ++i) {
>     for (j = 0; j < N; ++j) {
>       a[i * N + j] = (float)i + j % 100;
>       b[i * N + j] = (float)i + j % 100;
>     }
>   }
> }
> int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix
> &C) {
>   #pragma omp parallel
>   #pragma omp master
>   for (int i = 0; i < N / BS; ++i)
>     for (int j = 0; j < N / BS; ++j) {
>       float *BlockC = C.GetBlock(i, j);
>       for (int k = 0; k < N / BS; ++k) {
>         float *BlockA = A.GetBlock(i, k);
>         float *BlockB = B.GetBlock(k,j);
>         #pragma omp target depend(in: BlockA[0], BlockB[0]) \
>                            depend(inout: BlockC[0]) \
>                            map(to: BlockA[:BS*BS], BlockB[:BS*BS]) \
>                            map(tofrom: BlockC[:BS*BS]) nowait
>         #pragma omp parallel for
>         for(int ii = 0; ii < BS; ii++)
>           for(int jj = 0; jj < BS; jj++) {
>             for(int kk = 0; kk < BS; ++kk)
>               BlockC[ii + jj * BS] += BlockA[ii + kk * BS] * BlockB[kk +
> jj * BS];
>           }
>       }
>     }
>   return 0;
> }
> void Matmul(float *a, float *b, float *c) {
>   for (int i = 0; i < N; ++i) {
>     for (int j = 0; j < N; ++j) {
>       float sum = 0.0;
>       for (int k = 0; k < N; ++k) {
>         sum = sum + a[i * N + k] * b[k * N + j];
>       }
>       c[i * N + j] = sum;
>     }
>   }
> }
> int main(int argc, char *argv[]) {
>   double t_start, t_end;
>   int ret = 0;
>   float *a = (float *)malloc(sizeof(float) * N * N);
>   float *b = (float *)malloc(sizeof(float) * N * N);
>   float *c = (float *)calloc(sizeof(float), N * N);
>   init(a, b);
>   auto BlockedA = BlockMatrix(BS, BS, N, N);
>   BlockedA.Initialize(a);
>   BlockedA.Compare(a);
>   auto BlockedB = BlockMatrix(BS, BS, N, N);
>   BlockedB.Initialize(b);
>   BlockedB.Compare(b);
>   Matmul(a, b, c);
>   auto BlockedC = BlockMatrix(BS, BS, N, N);
>   BlockMatMul_TargetNowait(BlockedA, BlockedB, BlockedC);
>   if(BlockedC.Compare(c) > 0) {
>     // exit code to error if there is any missmatch
>     ret = 1;
>   }
>   free(a);
>   free(b);
>   free(c);
>   return ret;
> }
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/openmp-dev/attachments/20210212/60ae76e3/attachment.html>


More information about the Openmp-dev mailing list