[Openmp-commits] [openmp] r262535 - Add new OpenMP 4.5 taskloop construct feature

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Wed Mar 2 14:47:51 PST 2016


Author: jlpeyton
Date: Wed Mar  2 16:47:51 2016
New Revision: 262535

URL: http://llvm.org/viewvc/llvm-project?rev=262535&view=rev
Log:
Add new OpenMP 4.5 taskloop construct feature

From the standard: The taskloop construct specifies that the iterations of one
or more associated loops will be executed in parallel using OpenMP tasks. The
iterations are distributed across tasks created by the construct and scheduled
to be executed.
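
For reference, a minimal user-level sketch of what the construct looks like in
source code (standard OpenMP 4.5 syntax; illustrative only, not part of this
commit):

#include <stdio.h>
#include <omp.h>

#define N 1000
int a[N];

int main(void)
{
    #pragma omp parallel
    #pragma omp single
    // The encountering thread creates the tasks; the N iterations are split
    // into chunks of roughly 100 and executed by the threads of the team.
    #pragma omp taskloop grainsize(100)
    for (int i = 0; i < N; ++i)
        a[i] = 2 * i;
    printf("a[%d] = %d\n", N - 1, a[N - 1]);
    return 0;
}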

This initial implementation uses a simple linear task distribution algorithm.
Later we can add other algorithms to speed up the generation of a huge number of
tasks (e.g., tree-like task generation should be faster).
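
As a rough illustration of the linear split, here is a standalone sketch with
assumed example values; it mirrors the num_tasks/grainsize/extras arithmetic
used by __kmp_taskloop_linear below, not the runtime code itself:

#include <stdio.h>

int main(void)
{
    // Assume a trip count of 23 iterations and a grainsize clause of 5.
    unsigned long long tc = 23, grainsize = 5;
    unsigned long long num_tasks = tc / grainsize; // 4 tasks
    grainsize = tc / num_tasks;                    // rebalanced to 5 iterations per task
    unsigned long long extras = tc % num_tasks;    // 3 tasks get one extra iteration
    // Result: 3 tasks of 6 iterations + 1 task of 5 iterations = 23.
    printf("%llu tasks, %llu iterations each, %llu extras\n",
           num_tasks, grainsize, extras);
    return 0;
}

A tree-like scheme would instead let generated tasks recursively split their
sub-ranges, so that task creation itself runs in parallel rather than in one
loop on the encountering thread.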

This needs to land in the OpenMP runtime library first so that the compiler team
can develop the compiler side of the implementation.

Differential Revision: http://reviews.llvm.org/D17404

Added:
    openmp/trunk/runtime/test/tasking/kmp_taskloop.c   (with props)
Modified:
    openmp/trunk/runtime/src/dllexports
    openmp/trunk/runtime/src/kmp.h
    openmp/trunk/runtime/src/kmp_tasking.c

Modified: openmp/trunk/runtime/src/dllexports
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/dllexports?rev=262535&r1=262534&r2=262535&view=diff
==============================================================================
--- openmp/trunk/runtime/src/dllexports (original)
+++ openmp/trunk/runtime/src/dllexports Wed Mar  2 16:47:51 2016
@@ -393,6 +393,7 @@ kmpc_set_defaults
         __kmpc_doacross_wait                262
         __kmpc_doacross_post                263
         __kmpc_doacross_fini                264
+        __kmpc_taskloop                     266
     %endif
 %endif
 

Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=262535&r1=262534&r2=262535&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Wed Mar  2 16:47:51 2016
@@ -2205,11 +2205,7 @@ struct kmp_taskdata {
 #endif
 #if OMP_41_ENABLED
     kmp_task_team_t *       td_task_team;
-#endif
-#if KMP_HAVE_QUAD
-    _Quad                   td_dummy;             // Align structure 16-byte size since allocated just before kmp_task_t
-#else
-    kmp_uint32              td_dummy[2];
+    kmp_int32               td_size_alloc;        // The size of task structure, including shareds etc.
 #endif
 }; // struct kmp_taskdata
 
@@ -3478,7 +3474,9 @@ KMP_EXPORT int __kmp_get_cancellation_st
 
 KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
 KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
-
+KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_int32 if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void * task_dup );
 #endif
 
 #endif

Modified: openmp/trunk/runtime/src/kmp_tasking.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_tasking.c?rev=262535&r1=262534&r2=262535&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_tasking.c (original)
+++ openmp/trunk/runtime/src/kmp_tasking.c Wed Mar  2 16:47:51 2016
@@ -1000,6 +1000,7 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_
 #if OMP_41_ENABLED
     taskdata->td_flags.proxy           = flags->proxy;
     taskdata->td_task_team         = thread->th.th_task_team;
+    taskdata->td_size_alloc        = shareds_offset + sizeof_shareds;
 #endif
     taskdata->td_flags.tasktype    = TASK_EXPLICIT;
 
@@ -2877,4 +2878,231 @@ void __kmpc_proxy_task_completed_ooo ( k
     KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
 }
 
+//---------------------------------------------------------------------------------
+// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of the source task for taskloop
+//
+// thread:   allocating thread
+// task_src: pointer to source task to be duplicated
+// returns:  a pointer to the allocated kmp_task_t structure (task).
+kmp_task_t *
+__kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
+{
+    kmp_task_t     *task;
+    kmp_taskdata_t *taskdata;
+    kmp_taskdata_t *taskdata_src;
+    kmp_taskdata_t *parent_task = thread->th.th_current_task;
+    size_t shareds_offset;
+    size_t task_size;
+
+    KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
+    taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
+    KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be a proxy task
+    KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
+    task_size = taskdata_src->td_size_alloc;
+
+    // Allocate a kmp_taskdata_t block and a kmp_task_t block.
+    KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
+    #if USE_FAST_MEMORY
+    taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
+    #else
+    taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
+    #endif /* USE_FAST_MEMORY */
+    KMP_MEMCPY(taskdata, taskdata_src, task_size);
+
+    task = KMP_TASKDATA_TO_TASK(taskdata);
+
+    // Initialize new task (only those fields that must differ from the copied source task)
+    taskdata->td_task_id = KMP_GEN_TASK_ID();
+    if( task->shareds != NULL ) { // need to set up the shareds pointer
+        shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
+        task->shareds = &((char*)taskdata)[shareds_offset];
+        KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
+    }
+    taskdata->td_alloc_thread = thread;
+    taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
+
+    // Only need to keep track of child task counts if team parallel and tasking not serialized
+    if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
+        KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
+        if ( parent_task->td_taskgroup )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
+        // Only need to keep track of allocated child tasks for explicit tasks since implicit tasks are not deallocated
+        if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
+    }
+
+    KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
+                  thread, taskdata, taskdata->td_parent) );
+#if OMPT_SUPPORT
+    __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
+#endif
+    return task;
+}
+
+// Routine optionally generated by the compiler for setting the lastprivate flag
+// and calling needed constructors for private/firstprivate objects
+// (used to form taskloop tasks from the pattern task)
+typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+
+//---------------------------------------------------------------------------------
+// __kmp_taskloop_linear: Start tasks of the taskloop linearly
+//
+// loc       Source location information
+// gtid      Global thread ID
+// task      Task with whole loop iteration range
+// lb        Pointer to loop lower bound
+// ub        Pointer to loop upper bound
+// st        Loop stride
+// sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+// grainsize Schedule value if specified
+// task_dup  Task duplication routine
+void
+__kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int sched, kmp_uint64 grainsize, void *task_dup )
+{
+    p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+    kmp_uint64 tc;
+    kmp_uint64 lower = *lb; // compiler provides global bounds here
+    kmp_uint64 upper = *ub;
+    kmp_uint64 i, num_tasks, extras;
+    kmp_info_t *thread = __kmp_threads[gtid];
+    kmp_taskdata_t *current_task = thread->th.th_current_task;
+    kmp_task_t *next_task;
+    kmp_int32 lastpriv = 0;
+    size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
+    size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
+
+    // compute trip count
+    if ( st == 1 ) {   // most common case
+        tc = upper - lower + 1;
+    } else if ( st < 0 ) {
+        tc = (lower - upper) / (-st) + 1;
+    } else {       // st > 0
+        tc = (upper - lower) / st + 1;
+    }
+    if(tc == 0) {
+        // free the pattern task and exit
+        __kmp_task_start( gtid, task, current_task );
+        // do not execute anything for zero-trip loop
+        __kmp_task_finish( gtid, task, current_task );
+        return;
+    }
+
+    // compute num_tasks/grainsize based on the input provided
+    switch( sched ) {
+    case 0: // no schedule clause specified, we can choose the default
+            // let's try to schedule (team_size*10) tasks; fall through to the num_tasks case
+        grainsize = thread->th.th_team_nproc * 10;
+    case 2: // num_tasks provided
+        if( grainsize > tc ) {
+            num_tasks = tc;   // too big num_tasks requested, adjust values
+            grainsize = 1;
+            extras = 0;
+        } else {
+            num_tasks = grainsize;
+            grainsize = tc / num_tasks;
+            extras = tc % num_tasks;
+        }
+        break;
+    case 1: // grainsize provided
+        if( grainsize > tc ) {
+            num_tasks = 1;    // too big grainsize requested, adjust values
+            grainsize = tc;
+            extras = 0;
+        } else {
+            num_tasks = tc / grainsize;
+            grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
+            extras = tc % num_tasks;
+        }
+        break;
+    default:
+        KMP_ASSERT2(0, "unknown scheduling of taskloop");
+    }
+    KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+    KMP_DEBUG_ASSERT(num_tasks > extras);
+    KMP_DEBUG_ASSERT(num_tasks > 0);
+
+    // Main loop: launch num_tasks tasks, assigning grainsize iterations to each task
+    for( i = 0; i < num_tasks; ++i ) {
+        kmp_uint64 chunk_minus_1;
+        if( extras == 0 ) {
+            chunk_minus_1 = grainsize - 1;
+        } else {
+            chunk_minus_1 = grainsize;
+            --extras; // the first 'extras' tasks get a bigger chunk (grainsize+1)
+        }
+        upper = lower + st * chunk_minus_1;
+        if( i == num_tasks - 1 ) {
+            // schedule the last task, set lastprivate flag
+            lastpriv = 1;
+#if KMP_DEBUG
+            if( st == 1 )
+                KMP_DEBUG_ASSERT(upper == *ub);
+            else if( st > 0 )
+                KMP_DEBUG_ASSERT(upper+st > *ub);
+            else
+                KMP_DEBUG_ASSERT(upper+st < *ub);
+#endif
+        }
+        next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+        *(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
+        *(kmp_uint64*)((char*)next_task + upper_offset) = upper;
+        if( ptask_dup != NULL )
+            ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct firstprivates, etc.
+        __kmp_omp_task(gtid, next_task, true); // schedule new task
+        lower = upper + st; // adjust lower bound for the next iteration
+    }
+    // free the pattern task and exit
+    __kmp_task_start( gtid, task, current_task );
+    // do not execute the pattern task, just do bookkeeping
+    __kmp_task_finish( gtid, task, current_task );
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location information
+@param gtid      Global thread ID
+@param task      Task structure
+@param if_val    Value of the if clause
+@param lb        Pointer to loop lower bound
+@param ub        Pointer to loop upper bound
+@param st        Loop stride
+@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup  Task duplication routine
+
+Execute the taskloop construct.
+*/
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
+{
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+    KMP_DEBUG_ASSERT( task != NULL );
+
+    KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
+        gtid, taskdata, *lb, *ub, st, grainsize, sched));
+
+    // check the if clause value first
+    if( if_val == 0 ) { // if(0) specified, mark task as serial
+        taskdata->td_flags.task_serial = 1;
+        taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
+    }
+    if( nogroup == 0 ) {
+        __kmpc_taskgroup( loc, gtid );
+    }
+
+    if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
+        __kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
+    }
+
+    if( nogroup == 0 ) {
+        __kmpc_end_taskgroup( loc, gtid );
+    }
+    KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+}
+
 #endif

Added: openmp/trunk/runtime/test/tasking/kmp_taskloop.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_taskloop.c?rev=262535&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_taskloop.c (added)
+++ openmp/trunk/runtime/test/tasking/kmp_taskloop.c Wed Mar  2 16:47:51 2016
@@ -0,0 +1,158 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+#define N 4
+#define GRAIN 10
+#define STRIDE 3
+
+// globals
+int th_counter[N];
+int counter;
+
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+    void* dummy;
+} ident_t;
+
+typedef struct shar {
+    int(*pth_counter)[N];
+    int *pcounter;
+    int *pj;
+} *pshareds;
+
+typedef struct task {
+    pshareds shareds;
+    int(* routine)(int,struct task*);
+    int part_id;
+// privates:
+    unsigned long long lb; // library always uses unsigned 64-bit bounds
+    unsigned long long ub;
+    int st;
+    int last;
+    int i;
+    int j;
+    int th;
+} *ptask, kmp_task_t;
+
+typedef int(* task_entry_t)( int, ptask );
+
+void
+__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv)
+{
+// setup lastprivate flag
+    task_dst->last = lastpriv;
+// could be constructor calls here...
+}
+
+
+// OpenMP RTL interfaces
+typedef unsigned long long kmp_uint64;
+typedef long long kmp_int64;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int nogroup, int sched, kmp_int64 grainsize, void *task_dup );
+ptask
+__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags,
+                  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                  task_entry_t task_entry );
+void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs);
+int  __kmpc_global_thread_num(void *id_ref);
+#ifdef __cplusplus
+}
+#endif
+
+
+// User's code
+int task_entry(int gtid, ptask task)
+{
+    pshareds pshar = task->shareds;
+    for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) {
+        task->th = omp_get_thread_num();
+        __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1);
+        __kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1);
+        task->j = task->i;
+    }
+    my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks
+    if( task->last ) {
+        *(pshar->pj) = task->j; // lastprivate
+    }
+    return 0;
+}
+
+int main()
+{
+    int i, j, gtid = __kmpc_global_thread_num(NULL);
+    ptask task;
+    pshareds psh;
+    omp_set_dynamic(0);
+    counter = 0;
+    for( i=0; i<N; ++i )
+        th_counter[i] = 0;
+    #pragma omp parallel num_threads(N)
+    {
+      #pragma omp master
+      {
+        int gtid = __kmpc_global_thread_num(NULL);
+/*
+ *  This is what the OpenMP runtime calls correspond to:
+    #pragma omp taskloop num_tasks(N) lastprivate(j)
+    for( i=0; i<N*GRAIN*STRIDE-1; i+=STRIDE )
+    {
+        int th = omp_get_thread_num();
+        #pragma omp atomic
+            counter++;
+        #pragma omp atomic
+            th_counter[th]++;
+        j = i;
+    }
+*/
+    task = __kmpc_omp_task_alloc(NULL,gtid,1,sizeof(struct task),sizeof(struct shar),&task_entry);
+    psh = task->shareds;
+    psh->pth_counter = &th_counter;
+    psh->pcounter = &counter;
+    psh->pj = &j;
+    task->lb = 0;
+    task->ub = N*GRAIN*STRIDE-2;
+    task->st = STRIDE;
+
+    __kmpc_taskloop(
+        NULL,             // location
+        gtid,             // gtid
+        task,             // task structure
+        1,                // if clause value
+        &task->lb,        // lower bound
+        &task->ub,        // upper bound
+        STRIDE,           // loop increment
+        0,                // 1 if nogroup specified
+        2,                // schedule type: 0-none, 1-grainsize, 2-num_tasks
+        N,                // schedule value (ignored for type 0)
+        (void*)&__task_dup_entry // task duplication routine
+        );
+      } // end master
+    } // end parallel
+// check results
+    if( j != N*GRAIN*STRIDE-STRIDE ) {
+        printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE);
+        return 1;
+    }
+    if( counter != N*GRAIN ) {
+        printf("Error, counter %d != %d\n",counter,N*GRAIN);
+        return 1;
+    }
+    for( i=0; i<N; ++i ) {
+        if( th_counter[i] % GRAIN ) {
+            printf("Error, th_counter[%d] = %d\n",i,th_counter[i]);
+            return 1;
+        }
+    }
+    printf("passed\n");
+    return 0;
+}

Propchange: openmp/trunk/runtime/test/tasking/kmp_taskloop.c
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: openmp/trunk/runtime/test/tasking/kmp_taskloop.c
------------------------------------------------------------------------------
    svn:keywords = Author Date Id Rev URL

Propchange: openmp/trunk/runtime/test/tasking/kmp_taskloop.c
------------------------------------------------------------------------------
    svn:mime-type = text/plain



