[Openmp-commits] [openmp] 97d000c - Added API for "masked" construct via two entrypoints: __kmpc_masked,

via Openmp-commits <openmp-commits@lists.llvm.org>
Fri Mar 5 07:30:17 PST 2021


Author: tlwilmar
Date: 2021-03-05T09:29:57-06:00
New Revision: 97d000cfc6d498ec8f4cde48d75b6d0ddc390c38

URL: https://github.com/llvm/llvm-project/commit/97d000cfc6d498ec8f4cde48d75b6d0ddc390c38
DIFF: https://github.com/llvm/llvm-project/commit/97d000cfc6d498ec8f4cde48d75b6d0ddc390c38.diff

LOG: Added API for "masked" construct via two entrypoints: __kmpc_masked
and __kmpc_end_masked. The "master" construct is deprecated. Changed the
proc-bind keyword from "master" to "primary". Use of both the master
construct and master as a proc-bind keyword is still allowed, but
deprecated.

Removed references to "master" in comments and strings, replacing them
with "primary" or "primary thread". Function names and variables were
not touched, nor were references to the deprecated master construct;
these can be updated over time. No new code should refer to master.
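
For context, here is a minimal user-level sketch of the constructs
involved. This example is not part of the commit; it assumes an OpenMP
5.1 compiler that accepts the masked construct and the primary
proc-bind keyword.

  #include <omp.h>
  #include <stdio.h>

  int main(void) {
    // "primary" is the new proc-bind keyword; proc_bind(master) is
    // still accepted but deprecated.
    #pragma omp parallel num_threads(4) proc_bind(primary)
    {
      // "masked" replaces the deprecated "master" construct. With
      // filter(1), only the thread whose thread number is 1 executes
      // the block; without a filter clause, thread 0 (the primary
      // thread) executes it.
      #pragma omp masked filter(1)
      printf("thread %d ran the masked region\n", omp_get_thread_num());
    }
    return 0;
  }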

Added: 
    

Modified: 
    openmp/runtime/src/dllexports
    openmp/runtime/src/i18n/en_US.txt
    openmp/runtime/src/kmp.h
    openmp/runtime/src/kmp_barrier.cpp
    openmp/runtime/src/kmp_csupport.cpp
    openmp/runtime/src/kmp_dispatch.cpp
    openmp/runtime/src/kmp_dispatch_hier.h
    openmp/runtime/src/kmp_error.cpp
    openmp/runtime/src/kmp_global.cpp
    openmp/runtime/src/kmp_gsupport.cpp
    openmp/runtime/src/kmp_itt.h
    openmp/runtime/src/kmp_itt.inl
    openmp/runtime/src/kmp_omp.h
    openmp/runtime/src/kmp_runtime.cpp
    openmp/runtime/src/kmp_sched.cpp
    openmp/runtime/src/kmp_settings.cpp
    openmp/runtime/src/kmp_stats.cpp
    openmp/runtime/src/kmp_stats.h
    openmp/runtime/src/kmp_tasking.cpp
    openmp/runtime/src/kmp_threadprivate.cpp

Removed: 
    


################################################################################
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 961bf24a9f3b..79bca795d91e 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -390,6 +390,8 @@ kmpc_set_disp_num_buffers                   267
         __kmpc_taskred_init                 277
         __kmpc_taskred_modifier_init        278
         __kmpc_omp_target_task_alloc        279
+        __kmpc_masked                       282
+        __kmpc_end_masked                   283
 %endif
 
# User API entry points that have both lower- and upper-case versions for Fortran.

diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt
index c19165f53e5d..dc33fdbc7ff3 100644
--- a/openmp/runtime/src/i18n/en_US.txt
+++ b/openmp/runtime/src/i18n/en_US.txt
@@ -138,7 +138,7 @@ SysErr                       "OMP: System error #%1$d: %2$s\n"
 Hint                         "OMP: Hint %1$s\n"
 
 Pragma                       "%1$s pragma (at %2$s:%3$s():%4$s)"
-    # %1 is pragma name (like "parallel" or "master",
+    # %1 is pragma name (like "parallel" or "masked",
     # %2 is file name,
     # %3 is function (routine) name,
     # %4 is the line number (as string, so "s" type specifier should be used).

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index aa0ed7e12def..75e06f814c74 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -845,7 +845,7 @@ extern char *__kmp_cpuinfo_file;
 typedef enum kmp_proc_bind_t {
   proc_bind_false = 0,
   proc_bind_true,
-  proc_bind_master,
+  proc_bind_primary,
   proc_bind_close,
   proc_bind_spread,
   proc_bind_intel, // use KMP_AFFINITY interface
@@ -1458,7 +1458,8 @@ enum cons_type {
   ct_ordered_in_pdo,
   ct_master,
   ct_reduce,
-  ct_barrier
+  ct_barrier,
+  ct_masked
 };
 
 #define IS_CONS_TYPE_ORDERED(ct) ((ct) == ct_pdo_ordered)
@@ -1606,7 +1607,7 @@ struct private_common {
   struct private_common *next;
   struct private_common *link;
   void *gbl_addr;
-  void *par_addr; /* par_addr == gbl_addr for MASTER thread */
+  void *par_addr; /* par_addr == gbl_addr for PRIMARY thread */
   size_t cmn_size;
 };
 
@@ -2004,9 +2005,9 @@ union KMP_ALIGN_CACHE kmp_barrier_team_union {
     kmp_uint64 b_arrived; /* STATE => task reached synch point. */
 #if USE_DEBUGGER
    // The following two fields are intended for the debugger solely. Only
-    // master of the team accesses these fields: the first one is increased by
-    // 1 when master arrives to a barrier, the second one is increased by one
-    // when all the threads arrived.
+    // primary thread of the team accesses these fields: the first one is
+    // increased by 1 when the primary thread arrives to a barrier, the second
+    // one is increased by one when all the threads arrived.
     kmp_uint b_master_arrived;
     kmp_uint b_team_arrived;
 #endif
@@ -2536,7 +2537,7 @@ typedef struct kmp_teams_size {
 
 // This struct stores a thread that acts as a "root" for a contention
 // group. Contention groups are rooted at kmp_root threads, but also at
-// each master thread of each team created in the teams construct.
+// each primary thread of each team created in the teams construct.
 // This struct therefore also stores a thread_limit associated with
 // that contention group, and a counter to track the number of threads
 // active in that contention group. Each thread has a list of these: CG
@@ -2548,7 +2549,7 @@ typedef struct kmp_teams_size {
 typedef struct kmp_cg_root {
   kmp_info_p *cg_root; // "root" thread for a contention group
   // The CG root's limit comes from OMP_THREAD_LIMIT for root threads, or
-  // thread_limit clause for teams masters
+  // thread_limit clause for teams primary threads
   kmp_int32 cg_thread_limit;
   kmp_int32 cg_nthreads; // Count of active threads in CG rooted at cg_root
   struct kmp_cg_root *up; // pointer to higher level CG root in list
@@ -2558,8 +2559,9 @@ typedef struct kmp_cg_root {
 
 typedef struct KMP_ALIGN_CACHE kmp_base_info {
   /* Start with the readonly data which is cache aligned and padded. This is
-     written before the thread starts working by the master. Uber masters may
-     update themselves later. Usage does not consider serialized regions.  */
+     written before the thread starts working by the primary thread. Uber
+     masters may update themselves later. Usage does not consider serialized
+     regions.  */
   kmp_desc_t th_info;
   kmp_team_p *th_team; /* team we belong to */
   kmp_root_p *th_root; /* pointer to root of task hierarchy */
@@ -2570,7 +2572,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   /* The following are cached from the team info structure */
   /* TODO use these in more places as determined to be needed via profiling */
   int th_team_nproc; /* number of threads in a team */
-  kmp_info_p *th_team_master; /* the team's master thread */
+  kmp_info_p *th_team_master; /* the team's primary thread */
   int th_team_serialized; /* team is serialized */
   microtask_t th_teams_microtask; /* save entry address for teams construct */
   int th_teams_level; /* save initial level of teams construct */
@@ -2591,7 +2593,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */
 #endif
   omp_allocator_handle_t th_def_allocator; /* default allocator */
-  /* The data set by the master at reinit, then R/W by the worker */
+  /* The data set by the primary thread at reinit, then R/W by the worker */
   KMP_ALIGN_CACHE int
       th_set_nproc; /* if > 0, then only use this request for the next fork */
 #if KMP_NESTED_HOT_TEAMS
@@ -2627,7 +2629,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   ompt_thread_info_t ompt_thread_info;
 #endif
 
-  /* The following are also read by the master during reinit */
+  /* The following are also read by the primary thread during reinit */
   struct common_table *th_pri_common;
 
   volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */
@@ -2727,7 +2729,7 @@ typedef int (*launch_t)(int gtid);
 
 // Set up how many argv pointers will fit in cache lines containing
 // t_inline_argv. Historically, we have supported at least 96 bytes. Using a
-// larger value for more space between the master write/worker read section and
+// larger value for more space between the primary write/worker read section and
 // read/write by all section seems to buy more performance on EPCC PARALLEL.
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
 #define KMP_INLINE_ARGV_BYTES                                                  \
@@ -2753,11 +2755,11 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   std::atomic<void *> t_tg_reduce_data[2]; // to support task modifier
   std::atomic<int> t_tg_fini_counter[2]; // sync end of task reductions
 
-  // Master only
+  // Primary thread only
   // ---------------------------------------------------------------------------
-  KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team
-  int t_master_this_cons; // "this_construct" single counter of master in parent
-  // team
+  KMP_ALIGN_CACHE int t_master_tid; // tid of primary thread in parent team
+  int t_master_this_cons; // "this_construct" single counter of primary thread
+  // in parent team
   ident_t *t_ident; // if volatile, have to change too much other crud to
   // volatile too
   kmp_team_p *t_parent; // parent team
@@ -2769,7 +2771,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   kmp_uint64 t_region_time; // region begin timestamp
 #endif /* USE_ITT_BUILD */
 
-  // Master write, workers read
+  // Primary thread write, workers read
   // --------------------------------------------------------------------------
   KMP_ALIGN_CACHE void **t_argv;
   int t_argc;
@@ -2805,7 +2807,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   kmp_r_sched_t t_sched; // run-time schedule for the team
 #if KMP_AFFINITY_SUPPORTED
   int t_first_place; // first & last place in parent thread's partition.
-  int t_last_place; // Restore these values to master after par region.
+  int t_last_place; // Restore these values to primary thread after par region.
 #endif // KMP_AFFINITY_SUPPORTED
   int t_display_affinity;
   int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via
@@ -3742,6 +3744,9 @@ KMP_EXPORT void __kmpc_flush(ident_t *);
 KMP_EXPORT void __kmpc_barrier(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT kmp_int32 __kmpc_masked(ident_t *, kmp_int32 global_tid,
+                                   kmp_int32 filter);
+KMP_EXPORT void __kmpc_end_masked(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_ordered(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_critical(ident_t *, kmp_int32 global_tid,

diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index 9eb6701162e9..b4fde2aa067d 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -73,9 +73,9 @@ static bool __kmp_linear_barrier_gather_template(
               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
-    // Mark arrival to master thread
+    // Mark arrival to primary thread
     /* After performing this write, a worker thread may not assume that the team
-       is valid any more - it could be deallocated by the master thread at any
+       is valid any more - it could be deallocated by the primary thread at any
        time. */
     ANNOTATE_BARRIER_BEGIN(this_thr);
     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
@@ -166,7 +166,7 @@ static bool __kmp_linear_barrier_release_template(
     KMP_DEBUG_ASSERT(team != NULL);
     other_threads = team->t.t_threads;
 
-    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
+    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                   "barrier type %d\n",
                   gtid, team->t.t_id, tid, bt));
 
@@ -208,7 +208,7 @@ static bool __kmp_linear_barrier_release_template(
         flag.release();
       }
     }
-  } else { // Wait for the MASTER thread to release us
+  } else { // Wait for the PRIMARY thread to release us
     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
     if (cancellable) {
@@ -378,13 +378,13 @@ static void __kmp_tree_barrier_gather(
 
     // Mark arrival to parent thread
     /* After performing this write, a worker thread may not assume that the team
-       is valid any more - it could be deallocated by the master thread at any
+       is valid any more - it could be deallocated by the primary thread at any
        time.  */
     ANNOTATE_BARRIER_BEGIN(this_thr);
     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
     flag.release();
   } else {
-    // Need to update the team arrived pointer if we are the master thread
+    // Need to update the team arrived pointer if we are the primary thread
     if (nproc > 1) // New value was already computed above
       team->t.t_bar[bt].b_arrived = new_state;
     else
@@ -454,7 +454,7 @@ static void __kmp_tree_barrier_release(
   } else {
     team = __kmp_threads[gtid]->th.th_team;
     KMP_DEBUG_ASSERT(team != NULL);
-    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
+    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                   "barrier type %d\n",
                   gtid, team->t.t_id, tid, bt));
   }
@@ -556,7 +556,7 @@ static void __kmp_hyper_barrier_gather(
       // Mark arrival to parent thread
       /* After performing this write (in the last iteration of the enclosing for
          loop), a worker thread may not assume that the team is valid any more
-         - it could be deallocated by the master thread at any time.  */
+         - it could be deallocated by the primary thread at any time.  */
       ANNOTATE_BARRIER_BEGIN(this_thr);
       p_flag.set_waiter(other_threads[parent_tid]);
       p_flag.release();
@@ -614,7 +614,7 @@ static void __kmp_hyper_barrier_gather(
   }
 
   if (KMP_MASTER_TID(tid)) {
-    // Need to update the team arrived pointer if we are the master thread
+    // Need to update the team arrived pointer if we are the primary thread
     if (new_state == KMP_BARRIER_UNUSED_STATE)
       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
     else
@@ -650,14 +650,14 @@ static void __kmp_hyper_barrier_release(
      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
      are released in the reverse order of the corresponding gather, otherwise
      threads are released in the same order. */
-  if (KMP_MASTER_TID(tid)) { // master
+  if (KMP_MASTER_TID(tid)) { // primary thread
     team = __kmp_threads[gtid]->th.th_team;
     KMP_DEBUG_ASSERT(team != NULL);
-    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
+    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
                   "barrier type %d\n",
                   gtid, team->t.t_id, tid, bt));
 #if KMP_BARRIER_ICV_PUSH
-    if (propagate_icvs) { // master already has ICVs in final destination; copy
+    if (propagate_icvs) { // primary already has ICVs in final destination; copy
       copy_icvs(&thr_bar->th_fixed_icvs,
                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
     }
@@ -814,15 +814,15 @@ static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
   }
 
   if (uninitialized || team_sz_changed || tid_changed) {
-    thr_bar->my_level = thr_bar->depth - 1; // default for master
-    thr_bar->parent_tid = -1; // default for master
-    if (!KMP_MASTER_TID(
-            tid)) { // if not master, find parent thread in hierarchy
+    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
+    thr_bar->parent_tid = -1; // default for primary thread
+    if (!KMP_MASTER_TID(tid)) {
+      // if not primary thread, find parent thread in hierarchy
       kmp_uint32 d = 0;
       while (d < thr_bar->depth) { // find parent based on level of thread in
         // hierarchy, and note level
         kmp_uint32 rem;
-        if (d == thr_bar->depth - 2) { // reached level right below the master
+        if (d == thr_bar->depth - 2) { // reached level right below the primary
           thr_bar->parent_tid = 0;
           thr_bar->my_level = d;
           break;
@@ -1007,7 +1007,7 @@ static void __kmp_hierarchical_barrier_gather(
       }
     }
   }
-  // All subordinates are gathered; now release parent if not master thread
+  // All subordinates are gathered; now release parent if not primary thread
 
   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
@@ -1018,7 +1018,7 @@ static void __kmp_hierarchical_barrier_gather(
                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
     /* Mark arrival to parent: After performing this write, a worker thread may
        not assume that the team is valid any more - it could be deallocated by
-       the master thread at any time. */
+       the primary thread at any time. */
     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
       // flag; release it
@@ -1034,7 +1034,7 @@ static void __kmp_hierarchical_barrier_gather(
       flag.set_waiter(other_threads[thr_bar->parent_tid]);
       flag.release();
     }
-  } else { // Master thread needs to update the team's b_arrived value
+  } else { // Primary thread needs to update the team's b_arrived value
     team->t.t_bar[bt].b_arrived = new_state;
     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                   "arrived(%p) = %llu\n",
@@ -1059,7 +1059,7 @@ static void __kmp_hierarchical_barrier_release(
   if (KMP_MASTER_TID(tid)) {
     team = __kmp_threads[gtid]->th.th_team;
     KMP_DEBUG_ASSERT(team != NULL);
-    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
                   "entered barrier type %d\n",
                   gtid, team->t.t_id, tid, bt));
   } else { // Worker threads
@@ -1137,7 +1137,7 @@ static void __kmp_hierarchical_barrier_release(
     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                              FALSE);
     if (KMP_MASTER_TID(
-            tid)) { // master already has copy in final destination; copy
+            tid)) { // primary already has copy in final destination; copy
       copy_icvs(&thr_bar->th_fixed_icvs,
                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
@@ -1287,7 +1287,7 @@ template <> struct is_cancellable<false> {
    If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
    barrier
    When cancellable = false,
-     Returns 0 if master thread, 1 if worker thread.
+     Returns 0 if primary thread, 1 if worker thread.
    When cancellable = true
      Returns 0 if not cancelled, 1 if cancelled.  */
 template <bool cancellable = false>
@@ -1374,7 +1374,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
 #endif /* USE_ITT_BUILD */
 #if USE_DEBUGGER
     // Let the debugger know: the thread arrived to the barrier and waiting.
-    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
+    if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
       team->t.t_bar[bt].b_master_arrived += 1;
     } else {
       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
@@ -1442,7 +1442,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
         }
       }
 #if USE_ITT_BUILD
-      /* TODO: In case of split reduction barrier, master thread may send
+      /* TODO: In case of split reduction barrier, primary thread may send
          acquired event early, before the final summation into the shared
          variable is done (final summation can be a long operation for array
          reductions).  */
@@ -1474,7 +1474,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
           break;
         case 3:
           if (__itt_metadata_add_ptr) {
-            // Initialize with master's wait time
+            // Initialize with primary thread's wait time
             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
             // Set arrive time to zero to be able to check it in
             // __kmp_invoke_task(); the same is done inside the loop below
@@ -1594,7 +1594,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
   return status;
 }
 
-// Returns 0 if master thread, 1 if worker thread.
+// Returns 0 if primary thread, 1 if worker thread.
 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                   size_t reduce_size, void *reduce_data,
                   void (*reduce)(void *, void *)) {
@@ -1612,7 +1612,7 @@ int __kmp_barrier_gomp_cancel(int gtid) {
       int tid = __kmp_tid_from_gtid(gtid);
       kmp_info_t *this_thr = __kmp_threads[gtid];
       if (KMP_MASTER_TID(tid)) {
-        // Master does not need to revert anything
+        // Primary thread does not need to revert anything
       } else {
         // Workers need to revert their private b_arrived flag
         this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
@@ -1807,7 +1807,7 @@ void __kmp_join_barrier(int gtid) {
   }
 
   /* From this point on, the team data structure may be deallocated at any time
-     by the master thread - it is unsafe to reference it in any of the worker
+     by the primary thread - it is unsafe to reference it in any of the worker
      threads. Any per-team data items that need to be referenced before the
      end of the barrier should be moved to the kmp_task_team_t structs.  */
   if (KMP_MASTER_TID(tid)) {
@@ -1818,7 +1818,7 @@ void __kmp_join_barrier(int gtid) {
       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
     }
 #if KMP_STATS_ENABLED
-    // Have master thread flag the workers to indicate they are now waiting for
+    // Have primary thread flag the workers to indicate they are now waiting for
     // next parallel region, Also wake them up so they switch their timers to
     // idle.
     for (int i = 0; i < team->t.t_nproc; ++i) {
@@ -1860,7 +1860,7 @@ void __kmp_join_barrier(int gtid) {
         break;
       case 3:
         if (__itt_metadata_add_ptr) {
-          // Initialize with master's wait time
+          // Initialize with primary thread's wait time
           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
           // Set arrive time to zero to be able to check it in
           // __kmp_invoke_task(); the same is done inside the loop below
@@ -1920,7 +1920,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                 (team != NULL) ? team->t.t_id : -1, tid));
 
-  // th_team pointer only valid for master thread here
+  // th_team pointer only valid for primary thread here
   if (KMP_MASTER_TID(tid)) {
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
@@ -1956,8 +1956,8 @@ void __kmp_fork_barrier(int gtid, int tid) {
       __kmp_task_team_setup(this_thr, team, 0);
     }
 
-    /* The master thread may have changed its blocktime between the join barrier
-       and the fork barrier. Copy the blocktime info to the thread, where
+    /* The primary thread may have changed its blocktime between join barrier
+       and fork barrier. Copy the blocktime info to the thread, where
        __kmp_wait_template() can access it when the team struct is not
        guaranteed to exist. */
     // See note about the corresponding code in __kmp_join_barrier() being
@@ -1972,7 +1972,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
 #endif
     }
-  } // master
+  } // primary thread
 
   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
   case bp_hyper_bar: {
@@ -2049,25 +2049,25 @@ void __kmp_fork_barrier(int gtid, int tid) {
   }
 
   /* We can now assume that a valid team structure has been allocated by the
-     master and propagated to all worker threads. The current thread, however,
-     may not be part of the team, so we can't blindly assume that the team
-     pointer is non-null.  */
+     primary thread and propagated to all worker threads. The current thread,
+     however, may not be part of the team, so we can't blindly assume that the
+     team pointer is non-null.  */
   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
   KMP_DEBUG_ASSERT(team != NULL);
   tid = __kmp_tid_from_gtid(gtid);
 
 #if KMP_BARRIER_ICV_PULL
-  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
-     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
+  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
+     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
      implicit task has this data before this function is called. We cannot
-     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
-     struct, because it is not always the case that the threads arrays have
-     been allocated when __kmp_fork_call() is executed. */
+     modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
+     thread struct, because it is not always the case that the threads arrays
+     have been allocated when __kmp_fork_call() is executed. */
   {
     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
-    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
-      // Copy the initial ICVs from the master's thread struct to the implicit
-      // task for this tid.
+    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
+      // Copy the initial ICVs from the primary thread's thread struct to the
+      // implicit task for this tid.
       KA_TRACE(10,
                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
@@ -2138,13 +2138,13 @@ void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
 
-/* Master thread's copy of the ICVs was set up on the implicit taskdata in
-   __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
+/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
+   __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
    implicit task has this data before this function is called. */
 #if KMP_BARRIER_ICV_PULL
-  /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
-     untouched), where all of the worker threads can access them and make their
-     own copies after the barrier. */
+  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
+     remains untouched), where all of the worker threads can access them and
+     make their own copies after the barrier. */
   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
   // allocated at this point
   copy_icvs(
@@ -2158,12 +2158,12 @@ void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                 team->t.t_threads[0], team));
 #else
-  // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
+  // Copy the ICVs to each of the non-primary threads.  This takes O(nthreads)
   // time.
   ngo_load(new_icvs);
   KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
   // allocated at this point
-  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
+  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
     // TODO: GEH - pass in better source location info since usually NULL here
     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                   f, team->t.t_threads[f], team));

diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index c5682fb6fcef..4f34f3ac87b0 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -88,7 +88,7 @@ If the runtime has only been entered at the outermost level from a
 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
 that which would be returned by omp_get_thread_num() in the outermost
 active parallel construct. (Or zero if there is no active parallel
-construct, since the master thread is necessarily thread zero).
+construct, since the primary thread is necessarily thread zero).
 
 If multiple non-OpenMP threads all enter an OpenMP construct then this
 will be a unique thread identifier among all the threads created by
@@ -847,6 +847,92 @@ void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
   }
 }
 
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number.
+@param filter result of evaluating filter clause on thread global_tid, or zero
+if no filter clause is present
+@return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
+*/
+kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
+  int status = 0;
+  int tid;
+  KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
+  __kmp_assert_valid_gtid(global_tid);
+
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+  tid = __kmp_tid_from_gtid(global_tid);
+  if (tid == filter) {
+    KMP_COUNT_BLOCK(OMP_MASKED);
+    KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
+    status = 1;
+  }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (status) {
+    if (ompt_enabled.ompt_callback_masked) {
+      kmp_info_t *this_thr = __kmp_threads[global_tid];
+      kmp_team_t *team = this_thr->th.th_team;
+      ompt_callbacks.ompt_callback(ompt_callback_masked)(
+          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
+          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+          OMPT_GET_RETURN_ADDRESS(0));
+    }
+  }
+#endif
+
+  if (__kmp_env_consistency_check) {
+#if KMP_USE_DYNAMIC_LOCK
+    if (status)
+      __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
+    else
+      __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
+#else
+    if (status)
+      __kmp_push_sync(global_tid, ct_masked, loc, NULL);
+    else
+      __kmp_check_sync(global_tid, ct_masked, loc, NULL);
+#endif
+  }
+
+  return status;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number.
+
+Mark the end of a <tt>masked</tt> region. This should only be called by the
+thread that executes the <tt>masked</tt> region.
+*/
+void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
+  KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
+  __kmp_assert_valid_gtid(global_tid);
+  KMP_POP_PARTITIONED_TIMER();
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *this_thr = __kmp_threads[global_tid];
+  kmp_team_t *team = this_thr->th.th_team;
+  if (ompt_enabled.ompt_callback_masked) {
+    int tid = __kmp_tid_from_gtid(global_tid);
+    ompt_callbacks.ompt_callback(ompt_callback_masked)(
+        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
+        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+        OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+
+  if (__kmp_env_consistency_check) {
+    __kmp_pop_sync(global_tid, ct_masked, loc);
+  }
+}
+
 /*!
 @ingroup WORK_SHARING
 @param loc  source location information.
@@ -3375,7 +3461,7 @@ __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
 @param reduce_func callback function providing reduction operation on two
 operands and returning result of reduction in lhs_data
 @param lck pointer to the unique lock data structure
-@result 1 for the master thread, 0 for all other team threads, 2 for all team
+@result 1 for the primary thread, 0 for all other team threads, 2 for all team
 threads if atomic reduction needed
 
 The nowait version is used for a reduce clause with the nowait argument.
@@ -3471,11 +3557,11 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                                    tree_reduce_block)) {
 
 // AT: performance issue: a real barrier here
-// AT:     (if master goes slow, other threads are blocked here waiting for the
-// master to come and release them)
-// AT:     (it's not what a customer might expect specifying NOWAIT clause)
-// AT:     (specifying NOWAIT won't result in improvement of performance, it'll
-// be confusing to a customer)
+// AT: (if primary thread is slow, other threads are blocked here waiting for
+//      the primary thread to come and release them)
+// AT: (it's not what a customer might expect specifying NOWAIT clause)
+// AT: (specifying NOWAIT won't result in improvement of performance, it'll
+//      be confusing to a customer)
 // AT: another implementation of *barrier_gather*nowait() (or some other design)
 // might go faster and be more in line with sense of NOWAIT
 // AT: TO DO: do epcc test and compare times
@@ -3509,7 +3595,7 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
     }
 #endif
 
-    // all other workers except master should do this pop here
+    // all other workers except primary thread should do this pop here
     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
     if (__kmp_env_consistency_check) {
       if (retval == 0) {
@@ -3567,7 +3653,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
 
   } else if (packed_reduction_method == atomic_reduce_block) {
 
-    // neither master nor other workers should get here
+    // neither primary thread nor other workers should get here
     //     (code gen does not generate this call in case 2: atomic reduce block)
     // actually it's better to remove this elseif at all;
     // after removal this value will checked by the 'else' and will assert
@@ -3575,7 +3661,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                    tree_reduce_block)) {
 
-    // only master gets here
+    // only primary thread gets here
     // OMPT: tree reduction is annotated in the barrier code
 
   } else {
@@ -3605,7 +3691,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
 @param reduce_func callback function providing reduction operation on two
 operands and returning result of reduction in lhs_data
 @param lck pointer to the unique lock data structure
-@result 1 for the master thread, 0 for all other team threads, 2 for all team
+@result 1 for the primary thread, 0 for all other team threads, 2 for all team
 threads if atomic reduction needed
 
 A blocking reduce that includes an implicit barrier.
@@ -3699,10 +3785,10 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
     }
 #endif
 
-    // all other workers except master should do this pop here
-    // ( none of other workers except master will enter __kmpc_end_reduce() )
+    // all other workers except primary thread should do this pop here
+    // (none of other workers except primary will enter __kmpc_end_reduce())
     if (__kmp_env_consistency_check) {
-      if (retval == 0) { // 0: all other workers; 1: master
+      if (retval == 0) { // 0: all other workers; 1: primary thread
         __kmp_pop_sync(global_tid, ct_reduce, loc);
       }
     }
@@ -3828,7 +3914,7 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                    tree_reduce_block)) {
 
-    // only master executes here (master releases all other workers)
+    // only primary thread executes here (primary releases all other workers)
     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                             global_tid);
 

diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index 17b6072a532a..d14368243445 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -919,7 +919,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
     }
     // Report loop metadata
     if (itt_need_metadata_reporting) {
-      // Only report metadata by master of active team at level 1
+      // Only report metadata by primary thread of active team at level 1
       kmp_uint64 schedtype = 0;
       switch (schedule) {
       case kmp_sch_static_chunked:

diff --git a/openmp/runtime/src/kmp_dispatch_hier.h b/openmp/runtime/src/kmp_dispatch_hier.h
index 721c7f678e70..1181970a20ab 100644
--- a/openmp/runtime/src/kmp_dispatch_hier.h
+++ b/openmp/runtime/src/kmp_dispatch_hier.h
@@ -496,7 +496,7 @@ template <typename T> struct kmp_hier_t {
     T hier_id = (T)current->get_hier_id();
     // Attempt to grab next iteration range for this level
     if (previous_id == 0) {
-      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n",
+      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
                    gtid, hier_level));
       kmp_int32 contains_last;
       T my_lb, my_ub;
@@ -590,7 +590,7 @@ template <typename T> struct kmp_hier_t {
       }
       if (p_last)
         *p_last = contains_last;
-    } // if master thread of this unit
+    } // if primary thread of this unit
     if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
       KD_TRACE(10,
                ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
@@ -740,7 +740,7 @@ template <typename T> struct kmp_hier_t {
                 gtid));
       if (unit_id == 0) {
         // For hand threading, the sh buffer on the lowest level is only ever
-        // modified and read by the master thread on that level.  Because of
+        // modified and read by the primary thread on that level.  Because of
         // this, we can always use the first sh buffer.
         auto sh = &(parent->hier_barrier.sh[0]);
         KMP_DEBUG_ASSERT(sh);
@@ -784,7 +784,7 @@ template <typename T> struct kmp_hier_t {
           }
         }
         parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
-      } // if master thread of lowest unit level
+      } // if primary thread of lowest unit level
       parent->barrier(pr->get_hier_id(), tdata);
       if (unit_id != 0) {
         *p_lb = parent->get_curr_lb(tdata->index);
@@ -975,7 +975,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
   KMP_DEBUG_ASSERT(sh);
   pr->flags.use_hier = TRUE;
   pr->u.p.tc = 0;
-  // Have master allocate the hierarchy
+  // Have primary thread allocate the hierarchy
   if (__kmp_tid_from_gtid(gtid) == 0) {
     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
                   "hierarchy\n",
@@ -1071,7 +1071,7 @@ void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
       break;
     int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
     kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
-    // Only master threads of this unit within the hierarchy do initialization
+    // Only primary threads of this unit within the hierarchy do initialization
     KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
                   gtid, i));
     my_unit->reset_shared_barrier();

diff --git a/openmp/runtime/src/kmp_error.cpp b/openmp/runtime/src/kmp_error.cpp
index e0bff8a03075..cf5749dfd9fb 100644
--- a/openmp/runtime/src/kmp_error.cpp
+++ b/openmp/runtime/src/kmp_error.cpp
@@ -35,7 +35,8 @@ static char const *cons_text_c[] = {
     "\"ordered\"", /* in PDO */
     "\"master\"",
     "\"reduce\"",
-    "\"barrier\""};
+    "\"barrier\"",
+    "\"masked\""};
 
 #define get_src(ident) ((ident) == NULL ? NULL : (ident)->psource)
 
@@ -316,7 +317,7 @@ __kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_l
       /* we are in CRITICAL which is inside a CRITICAL construct of same name */
       __kmp_error_construct2(kmp_i18n_msg_CnsNestingSameName, ct, ident, &cons);
     }
-  } else if (ct == ct_master || ct == ct_reduce) {
+  } else if (ct == ct_master || ct == ct_masked || ct == ct_reduce) {
     if (p->w_top > p->p_top) {
       /* inside a WORKSHARING construct for this PARALLEL region */
       __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,

diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 034e3ee2acc5..6a4f21383afb 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -438,7 +438,7 @@ kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
 
 /* ------------------------------------------------------ */
 /* STATE mostly synchronized with global lock */
-/* data written to rarely by masters, read often by workers */
+/* data written to rarely by primary threads, read often by workers */
 /* TODO: None of this global padding stuff works consistently because the order
    of declaration is not necessarily correlated to storage order. To fix this,
    all the important globals must be put in a big structure instead. */
@@ -446,7 +446,7 @@ KMP_ALIGN_CACHE
 kmp_info_t **__kmp_threads = NULL;
 kmp_root_t **__kmp_root = NULL;
 
-/* data read/written to often by masters */
+/* data read/written to often by primary threads */
 KMP_ALIGN_CACHE
 volatile int __kmp_nth = 0;
 volatile int __kmp_all_nth = 0;

diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp
index c622b81d8b73..e57641351a04 100644
--- a/openmp/runtime/src/kmp_gsupport.cpp
+++ b/openmp/runtime/src/kmp_gsupport.cpp
@@ -1877,7 +1877,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT)(
   va_end(args);
 }
 
-// fn: the function each master thread of new team will call
+// fn: the function each primary thread of new team will call
 // data: argument to fn
 // num_teams, thread_limit: max bounds on respective ICV
 // flags: unused

diff --git a/openmp/runtime/src/kmp_itt.h b/openmp/runtime/src/kmp_itt.h
index 4340b188409f..9872764a375c 100644
--- a/openmp/runtime/src/kmp_itt.h
+++ b/openmp/runtime/src/kmp_itt.h
@@ -53,9 +53,9 @@ void __kmp_itt_reset();
 // --- Parallel region reporting ---
 __kmp_inline void
 __kmp_itt_region_forking(int gtid, int team_size,
-                         int barriers); // Master only, before forking threads.
+                         int barriers); // Primary only, before forking threads.
 __kmp_inline void
-__kmp_itt_region_joined(int gtid); // Master only, after joining threads.
+__kmp_itt_region_joined(int gtid); // Primary only, after joining threads.
 // (*) Note: A thread may execute tasks after this point, though.
 
 // --- Frame reporting ---
@@ -191,7 +191,7 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller);
 #define SSC_MARK_SPIN_END() INSERT_SSC_MARK(0x4377)
 
 // Markers for architecture simulation.
-// FORKING      : Before the master thread forks.
+// FORKING      : Before the primary thread forks.
 // JOINING      : At the start of the join.
 // INVOKING     : Before the threads invoke microtasks.
 // DISPATCH_INIT: At the start of dynamically scheduled loop.

diff --git a/openmp/runtime/src/kmp_itt.inl b/openmp/runtime/src/kmp_itt.inl
index e7c6041d619c..ecfcb966bb79 100644
--- a/openmp/runtime/src/kmp_itt.inl
+++ b/openmp/runtime/src/kmp_itt.inl
@@ -64,14 +64,14 @@ static kmp_bootstrap_lock_t metadata_lock =
     KMP_BOOTSTRAP_LOCK_INITIALIZER(metadata_lock);
 
 /* Parallel region reporting.
- * __kmp_itt_region_forking should be called by master thread of a team.
+ * __kmp_itt_region_forking should be called by primary thread of a team.
    Exact moment of call does not matter, but it should be completed before any
    thread of this team calls __kmp_itt_region_starting.
  * __kmp_itt_region_starting should be called by each thread of a team just
    before entering parallel region body.
  * __kmp_itt_region_finished should be called by each thread of a team right
    after returning from parallel region body.
- * __kmp_itt_region_joined should be called by master thread of a team, after
+ * __kmp_itt_region_joined should be called by primary thread of a team, after
    all threads called __kmp_itt_region_finished.
 
  Note: Thread waiting at join barrier (after __kmp_itt_region_finished) can
@@ -448,10 +448,10 @@ LINKAGE void __kmp_itt_region_joined(int gtid) {
 /* Barriers reporting.
 
    A barrier consists of two phases:
-   1. Gather -- master waits for arriving of all the worker threads; each
+   1. Gather -- primary thread waits for all worker threads to arrive; each
       worker thread registers arrival and goes further.
-   2. Release -- each worker threads waits until master lets it go; master lets
-      worker threads go.
+   2. Release -- each worker thread waits until primary thread lets it go;
+      primary thread lets worker threads go.
 
    Function should be called by each thread:
    * __kmp_itt_barrier_starting() -- before arriving to the gather phase.
@@ -487,7 +487,7 @@ void *__kmp_itt_barrier_object(int gtid, int bt, int set_name,
   // solution, and reporting fork/join barriers to ITT should be revisited.
 
   if (team != NULL) {
-    // Master thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time.
+    // Primary thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time.
     // Divide b_arrived by KMP_BARRIER_STATE_BUMP to get plain barrier counter.
     kmp_uint64 counter =
         team->t.t_bar[bt].b_arrived / KMP_BARRIER_STATE_BUMP + delta;
@@ -550,12 +550,13 @@ void *__kmp_itt_barrier_object(int gtid, int bt, int set_name,
       case bs_forkjoin_barrier: {
         // In case of fork/join barrier we can read thr->th.th_ident, because it
         // contains location of last passed construct (while join barrier is not
-        // such one). Use th_ident of master thread instead -- __kmp_join_call()
-        // called by the master thread saves location.
+        // such one). Use th_ident of primary thread instead --
+        // __kmp_join_call() called by the primary thread saves location.
         //
-        // AC: cannot read from master because __kmp_join_call may be not called
-        //    yet, so we read the location from team. This is the same location.
-        //    And team is valid at the enter to join barrier where this happens.
+        // AC: cannot read from primary thread because __kmp_join_call may not
+        //    be called yet, so we read the location from team. This is the
+        //    same location. Team is valid on entry to join barrier where this
+        //    happens.
         loc = team->t.t_ident;
         if (loc != NULL) {
           src = loc->psource;
@@ -958,7 +959,7 @@ void __kmp_itt_thread_name(int gtid) {
     kmp_str_buf_t name;
     __kmp_str_buf_init(&name);
     if (KMP_MASTER_GTID(gtid)) {
-      __kmp_str_buf_print(&name, "OMP Master Thread #%d", gtid);
+      __kmp_str_buf_print(&name, "OMP Primary Thread #%d", gtid);
     } else {
       __kmp_str_buf_print(&name, "OMP Worker Thread #%d", gtid);
     }
@@ -986,9 +987,9 @@ void __kmp_itt_system_object_created(void *object, char const *name) {
 } // __kmp_itt_system_object_created
 
 /* Stack stitching api.
-   Master calls "create" and put the stitching id into team structure.
+   Primary thread calls "create" and puts the stitching id into team structure.
    Workers read the stitching id and call "enter" / "leave" api.
-   Master calls "destroy" at the end of the parallel region. */
+   Primary thread calls "destroy" at the end of the parallel region. */
 
 __itt_caller __kmp_itt_stack_caller_create() {
 #if USE_ITT_NOTIFY

diff --git a/openmp/runtime/src/kmp_omp.h b/openmp/runtime/src/kmp_omp.h
index c7ba32a14338..995241ff65cd 100644
--- a/openmp/runtime/src/kmp_omp.h
+++ b/openmp/runtime/src/kmp_omp.h
@@ -123,7 +123,7 @@ typedef struct {
 
   /* team structure information */
   kmp_int32 t_sizeof_struct;
-  offset_and_size_t t_master_tid; // tid of master in parent team
+  offset_and_size_t t_master_tid; // tid of primary thread in parent team
   offset_and_size_t t_ident; // location of parallel region
   offset_and_size_t t_parent; // parent team
   offset_and_size_t t_nproc; // # team threads
@@ -136,7 +136,7 @@ typedef struct {
   offset_and_size_t t_cancel_request;
   offset_and_size_t t_bar;
   offset_and_size_t
-      t_b_master_arrived; // increased by 1 when master arrives to a barrier
+      t_b_master_arrived; // incremented when primary thread reaches barrier
   offset_and_size_t
       t_b_team_arrived; // increased by one when all the threads arrived
 

diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index f77196faa11d..8f42a9d3fe0c 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -684,8 +684,8 @@ int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
 #if USE_ITT_BUILD
     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
-        team->t.t_active_level ==
-            1) { // Only report metadata by master of active team at level 1
+        team->t.t_active_level == 1) {
+      // Only report metadata by primary thread of active team at level 1
       __kmp_itt_metadata_single(id_ref);
     }
 #endif /* USE_ITT_BUILD */
@@ -911,7 +911,7 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
   KMP_MB();
 
-  /* first, let's setup the master thread */
+  /* first, let's setup the primary thread */
   master_th->th.th_info.ds.ds_tid = 0;
   master_th->th.th_team = team;
   master_th->th.th_team_nproc = team->t.t_nproc;
@@ -956,7 +956,7 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
 #endif
   if (!use_hot_team) {
 
-    /* install the master thread */
+    /* install the primary thread */
     team->t.t_threads[0] = master_th;
     __kmp_initialize_info(master_th, team, 0, master_gtid);
 
@@ -1019,7 +1019,7 @@ inline static void propagateFPControl(kmp_team_t *team) {
     kmp_int16 x87_fpu_control_word;
     kmp_uint32 mxcsr;
 
-    // Get master values of FPU control flags (both X87 and vector)
+    // Get primary thread's values of FPU control flags (both X87 and vector)
     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
     __kmp_store_mxcsr(&mxcsr);
     mxcsr &= KMP_X86_MXCSR_MASK;
@@ -1076,7 +1076,7 @@ static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                      int realloc); // forward declaration
 
 /* Run a parallel region that has been serialized, so runs only in a team of the
-   single master thread. */
+   single primary thread. */
 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
   kmp_info_t *this_thr;
   kmp_team_t *serial_team;
@@ -1600,12 +1600,12 @@ int __kmp_fork_call(ident_t *loc, int gtid,
       if (call_context == fork_context_gnu)
         return TRUE;
 
-      /* Invoke microtask for MASTER thread */
+      /* Invoke microtask for PRIMARY thread */
       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                     parent_team->t.t_id, parent_team->t.t_pkfn));
 
       if (!parent_team->t.t_invoke(gtid)) {
-        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
+        KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
       }
       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                     parent_team->t.t_id, parent_team->t.t_pkfn));
@@ -2006,7 +2006,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
     }
     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
-    // set master's schedule as new run-time schedule
+    // set primary thread's schedule as new run-time schedule
     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
 
     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
@@ -2016,18 +2016,18 @@ int __kmp_fork_call(ident_t *loc, int gtid,
     propagateFPControl(team);
 
     if (__kmp_tasking_mode != tskm_immediate_exec) {
-      // Set master's task team to team's task team. Unless this is hot team, it
-      // should be NULL.
+      // Set primary thread's task team to team's task team. Unless this is hot
+      // team, it should be NULL.
       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                        parent_team->t.t_task_team[master_th->th.th_task_state]);
-      KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
+      KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
                     "%p, new task_team %p / team %p\n",
                     __kmp_gtid_from_thread(master_th),
                     master_th->th.th_task_team, parent_team,
                     team->t.t_task_team[master_th->th.th_task_state], team));
 
       if (active_level || master_th->th.th_task_team) {
-        // Take a memo of master's task_state
+        // Take a memo of primary thread's task_state
         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
         if (master_th->th.th_task_state_top >=
             master_th->th.th_task_state_stack_sz) { // increase size
@@ -2047,7 +2047,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
           master_th->th.th_task_state_stack_sz = new_size;
           __kmp_free(old_stack);
         }
-        // Store master's task_state on stack
+        // Store primary thread's task_state on stack
         master_th->th
             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
             master_th->th.th_task_state;
@@ -2056,7 +2056,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
         if (master_th->th.th_hot_teams &&
             active_level < __kmp_hot_teams_max_level &&
             team == master_th->th.th_hot_teams[active_level].hot_team) {
-          // Restore master's nested state if nested hot team
+          // Restore primary thread's nested state if nested hot team
           master_th->th.th_task_state =
               master_th->th
                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
@@ -2163,7 +2163,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
     }
 #endif /* USE_ITT_BUILD */
 
-    // AC: skip __kmp_internal_fork at teams construct, let only master
+    // AC: skip __kmp_internal_fork at teams construct, let only primary
     // threads execute
     if (ap) {
       __kmp_internal_fork(loc, gtid, team);
@@ -2177,7 +2177,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
       return TRUE;
     }
 
-    /* Invoke microtask for MASTER thread */
+    /* Invoke microtask for PRIMARY thread */
     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                   team->t.t_id, team->t.t_pkfn));
   } // END of timer KMP_fork_call block
@@ -2191,7 +2191,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
 #endif
 
   if (!team->t.t_invoke(gtid)) {
-    KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
+    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
   }
 
 #if KMP_STATS_ENABLED
@@ -2397,7 +2397,7 @@ void __kmp_join_call(ident_t *loc, int gtid
 
     // Restore number of threads in the team if needed. This code relies on
     // the proper adjustment of th_teams_size.nth after the fork in
-    // __kmp_teams_master on each teams master in the case that
+    // __kmp_teams_master on each teams primary thread in the case that
     // __kmp_reserve_threads reduced it.
     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
       int old_num = master_th->th.th_team_nproc;
@@ -2513,7 +2513,7 @@ void __kmp_join_call(ident_t *loc, int gtid
     if (master_th->th.th_task_state_top >
         0) { // Restore task state from memo stack
       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
-      // Remember master's state if we re-use this nested hot team
+      // Remember primary thread's state if we re-use this nested hot team
       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
           master_th->th.th_task_state;
       --master_th->th.th_task_state_top; // pop
@@ -2522,11 +2522,11 @@ void __kmp_join_call(ident_t *loc, int gtid
           master_th->th
               .th_task_state_memo_stack[master_th->th.th_task_state_top];
     }
-    // Copy the task team from the parent team to the master thread
+    // Copy the task team from the parent team to the primary thread
     master_th->th.th_task_team =
         parent_team->t.t_task_team[master_th->th.th_task_state];
     KA_TRACE(20,
-             ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
+             ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
               parent_team));
   }
@@ -3161,8 +3161,8 @@ static void __kmp_initialize_root(kmp_root_t *root) {
 #endif
                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                           0 // argc
-                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
-      );
+                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
+                          );
 #if USE_DEBUGGER
   // Non-NULL value should be assigned to make the debugger display the root
   // team.
@@ -3198,8 +3198,8 @@ static void __kmp_initialize_root(kmp_root_t *root) {
 #endif
                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                           0 // argc
-                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
-      );
+                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
+                          );
   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
 
   root->r.r_hot_team = hot_team;
@@ -3333,7 +3333,7 @@ void __kmp_print_structure(void) {
         __kmp_print_structure_team("    Serial Team:  ",
                                    thread->th.th_serial_team);
         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
-        __kmp_print_structure_thread("    Master:       ",
+        __kmp_print_structure_thread("    Primary:      ",
                                      thread->th.th_team_master);
         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
@@ -3381,7 +3381,7 @@ void __kmp_print_structure(void) {
     int i;
     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
-    __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
+    __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
@@ -3750,7 +3750,7 @@ int __kmp_register_root(int initial_thread) {
   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
   TCW_4(__kmp_init_gtid, TRUE);
 
-  /* prepare the master thread for get_gtid() */
+  /* prepare the primary thread for get_gtid() */
   __kmp_gtid_set_specific(gtid);
 
 #if USE_ITT_BUILD
@@ -3846,7 +3846,7 @@ static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
   KMP_DEBUG_ASSERT(level < max_level);
   kmp_team_t *team = hot_teams[level].hot_team;
   nth = hot_teams[level].hot_team_nth;
-  n = nth - 1; // master is not freed
+  n = nth - 1; // primary thread is not freed
   if (level < max_level - 1) {
     for (i = 0; i < nth; ++i) {
       kmp_info_t *th = team->t.t_threads[i];
@@ -4104,9 +4104,9 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
     this_thr->th.th_pri_head = NULL;
   }
 
-  if (this_thr != master && // Master's CG root is initialized elsewhere
+  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
-    // Make new thread's CG root same as master's
+    // Make new thread's CG root same as primary thread's
     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
     if (tmp) {
@@ -4287,11 +4287,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
       // The reason is that if the library is loaded/unloaded in a loop with
       // small (parallel) work in between, then there is high probability that
       // monitor thread started after the library shutdown. At shutdown it is
-      // too late to cope with the problem, because when the master is in
-      // DllMain (process detach) the monitor has no chances to start (it is
-      // blocked), and master has no means to inform the monitor that the
-      // library has gone, because all the memory which the monitor can access
-      // is going to be released/reset.
+      // too late to cope with the problem, because when the primary thread is
+      // in DllMain (process detach) the monitor has no chances to start (it is
+      // blocked), and primary thread has no means to inform the monitor that
+      // the library has gone, because all the memory which the monitor can
+      // access is going to be released/reset.
       while (TCR_4(__kmp_init_monitor) < 2) {
         KMP_YIELD(TRUE);
       }
@@ -4361,7 +4361,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
     __kmp_print_thread_storage_map(new_thr, new_gtid);
   }
 
-  // add the reserve serialized team, initialized from the team's master thread
+  // add the reserve serialized team, initialized from the team's primary thread
   {
     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
@@ -4485,7 +4485,7 @@ static void __kmp_reinitialize_team(kmp_team_t *team,
   KMP_CHECK_UPDATE(team->t.t_ident, loc);
 
   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
-  // Copy ICVs to the master thread's implicit taskdata
+  // Copy ICVs to the primary thread's implicit taskdata
   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
 
@@ -4571,11 +4571,11 @@ __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
 #if KMP_AFFINITY_SUPPORTED
 
 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
-// It calculates the worker + master thread's partition based upon the parent
+// It calculates the worker + primary thread's partition based upon the parent
 // thread's partition, and binds each worker to a thread in their partition.
-// The master thread's partition should already include its current binding.
+// The primary thread's partition should already include its current binding.
 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
-  // Copy the master thread's place partition to the team struct
+  // Copy the primary thread's place partition to the team struct
   kmp_info_t *master_th = team->t.t_threads[0];
   KMP_DEBUG_ASSERT(master_th != NULL);
   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
@@ -4593,12 +4593,12 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
   switch (proc_bind) {
 
   case proc_bind_default:
-    // serial teams might have the proc_bind policy set to proc_bind_default. It
-    // doesn't matter, as we don't rebind master thread for any proc_bind policy
+    // Serial teams might have the proc_bind policy set to proc_bind_default.
+    // Not an issue -- we don't rebind primary thread for any proc_bind policy.
     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
     break;
 
-  case proc_bind_master: {
+  case proc_bind_primary: {
     int f;
     int n_th = team->t.t_nproc;
     for (f = 1; f < n_th; f++) {
@@ -4612,7 +4612,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
         team->t.t_display_affinity = 1;
       }
 
-      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
+      KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
                      "partition = [%d,%d]\n",
                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                      f, masters_place, first_place, last_place));
@@ -5000,7 +5000,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 
       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
       kmp_r_sched_t new_sched = new_icvs->sched;
-      // set master's schedule as new run-time schedule
+      // set primary thread's schedule as new run-time schedule
       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
 
       __kmp_reinitialize_team(team, new_icvs,
@@ -5080,7 +5080,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
       }
 
-      // restore the current task state of the master thread: should be the
+      // restore the current task state of the primary thread: should be the
       // implicit task
       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
                     team->t.t_threads[0], team));
@@ -5150,10 +5150,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
         }
 
 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
-        /* Temporarily set full mask for master thread before creation of
-           workers. The reason is that workers inherit the affinity from master,
-           so if a lot of workers are created on the single core quickly, they
-           don't get a chance to set their own affinity for a long time. */
+        /* Temporarily set full mask for primary thread before creation of
+           workers. The reason is that workers inherit the affinity from the
+           primary thread, so if a lot of workers are created on the single
+           core quickly, they don't get a chance to set their own affinity for
+           a long time. */
         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
 #endif
 
@@ -5186,7 +5187,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 
 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
         if (KMP_AFFINITY_CAPABLE()) {
-          /* Restore initial master thread's affinity mask */
+          /* Restore initial primary thread's affinity mask */
           __kmp_set_system_affinity(old_mask, TRUE);
           KMP_CPU_FREE(old_mask);
         }
@@ -5209,15 +5210,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
       if (level) { // set th_task_state for new threads in nested hot team
         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
         // only need to set the th_task_state for the new threads. th_task_state
-        // for master thread will not be accurate until after this in
-        // __kmp_fork_call(), so we look to the master's memo_stack to get the
-        // correct value.
+        // for primary thread will not be accurate until after this in
+        // __kmp_fork_call(), so we look to the primary thread's memo_stack to
+        // get the correct value.
         for (f = old_nproc; f < team->t.t_nproc; ++f)
           team->t.t_threads[f]->th.th_task_state =
               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
       } else { // set th_task_state for new threads in non-nested hot team
-        kmp_uint8 old_state =
-            team->t.t_threads[0]->th.th_task_state; // copy master's state
+        // copy primary thread's state
+        kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
         for (f = old_nproc; f < team->t.t_nproc; ++f)
           team->t.t_threads[f]->th.th_task_state = old_state;
       }
@@ -5518,7 +5519,7 @@ void __kmp_free_team(kmp_root_t *root,
     /* TODO limit size of team pool, call reap_team if pool too large */
     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
     __kmp_team_pool = (volatile kmp_team_t *)team;
-  } else { // Check if team was created for the masters in a teams construct
+  } else { // Check if team was created for primary threads in teams construct
     // See if first worker is a CG root
     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
                      team->t.t_threads[1]->th.th_cg_roots);
@@ -7314,7 +7315,7 @@ int __kmp_invoke_task_func(int gtid) {
 }
 
 void __kmp_teams_master(int gtid) {
-  // This routine is called by all master threads in teams construct
+  // This routine is called by all primary threads in teams construct
   kmp_info_t *thr = __kmp_threads[gtid];
   kmp_team_t *team = thr->th.th_team;
   ident_t *loc = team->t.t_ident;
@@ -7327,7 +7328,7 @@ void __kmp_teams_master(int gtid) {
   // This thread is a new CG root.  Set up the proper variables.
   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
   tmp->cg_root = thr; // Make thr the CG root
-  // Init to thread limit that was stored when league masters were forked
+  // Init to thread limit stored when league primary threads were forked
   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
@@ -7433,7 +7434,7 @@ static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
       num_threads = 1;
     }
   } else {
-    // This thread will be the master of the league masters
+    // This thread will be the primary thread of the league primary threads
     // Store new thread limit; old limit is saved in th_cg_roots list
     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
     // num_threads = min(num_threads, nthreads-var)
@@ -7676,10 +7677,10 @@ static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
   }
   hot_team = root->r.r_hot_team;
   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
-    return hot_team->t.t_nproc - 1; // Don't count master thread
+    return hot_team->t.t_nproc - 1; // Don't count primary thread
   }
 
-  // Skip the master thread - it is accounted for elsewhere.
+  // Skip the primary thread - it is accounted for elsewhere.
   retval = 0;
   for (i = 1; i < hot_team->t.t_nproc; i++) {
     if (hot_team->t.t_threads[i]->th.th_active) {
@@ -7712,8 +7713,8 @@ static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
 
   // Threads that are active in the thread pool, active in the hot team for this
   // particular root (if we are at the outer par level), and the currently
-  // executing thread (to become the master) are available to add to the new
-  // team, but are currently contributing to the system load, and must be
+  // executing thread (to become the primary thread) are available to add to the
+  // new team, but are currently contributing to the system load, and must be
   // accounted for.
   pool_active = __kmp_thread_pool_active_nth;
   hot_team_active = __kmp_active_hot_team_nproc(root);
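
The __kmp_teams_master and __kmp_push_thread_limit changes above concern the
league primary threads: each becomes its own contention-group root, so
thread_limit is enforced per team. A minimal host illustration (assumes a
compiler with OpenMP 5.0 host teams support, e.g. clang++ -fopenmp):

    #include <omp.h>
    #include <cstdio>

    int main() {
      // Two teams; each team's primary thread carries a CG root that caps
      // that team at 4 threads.
      #pragma omp teams num_teams(2) thread_limit(4)
      #pragma omp parallel
      {
        if (omp_get_thread_num() == 0) // the primary thread of each team
          std::printf("team %d has %d threads\n", omp_get_team_num(),
                      omp_get_num_threads());
      }
      return 0;
    }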

diff  --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp
index 7e6d6df7227f..c04c93985cbe 100644
--- a/openmp/runtime/src/kmp_sched.cpp
+++ b/openmp/runtime/src/kmp_sched.cpp
@@ -522,8 +522,8 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
         __kmp_static == kmp_sch_static_greedy ||
         __kmp_static ==
             kmp_sch_static_balanced); // Unknown static scheduling type.
-    // only masters of some teams get single iteration, other threads get
-    // nothing
+    // only primary threads of some teams get single iteration, other threads
+    // get nothing
     if (team_id < trip_count && tid == 0) {
       *pupper = *pupperDist = *plower = *plower + team_id * incr;
     } else {
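
The kmp_sched.cpp comment covers the corner case where the distribute trip
count is smaller than the number of teams. A hypothetical illustration
(assumes the static distribution described above):

    #include <omp.h>
    #include <cstdio>

    int main() {
      // trip_count (2) < number of teams (4): only the primary threads of
      // teams 0 and 1 get one iteration each; teams 2 and 3 get nothing.
      #pragma omp teams num_teams(4)
      #pragma omp distribute
      for (int i = 0; i < 2; i++)
        std::printf("iteration %d ran in team %d\n", i, omp_get_team_num());
      return 0;
    }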

diff  --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index fa63e55eac3b..35c15ee2a2e6 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -3239,11 +3239,12 @@ static void __kmp_stg_parse_proc_bind(char const *name, char const *value,
     for (;;) {
       enum kmp_proc_bind_t bind;
 
-      if ((num == (int)proc_bind_master) ||
-          __kmp_match_str("master", buf, &next)) {
+      if ((num == (int)proc_bind_primary) ||
+          __kmp_match_str("master", buf, &next) ||
+          __kmp_match_str("primary", buf, &next)) {
         buf = next;
         SKIP_WS(buf);
-        bind = proc_bind_master;
+        bind = proc_bind_primary;
       } else if ((num == (int)proc_bind_close) ||
                  __kmp_match_str("close", buf, &next)) {
         buf = next;
@@ -3311,8 +3312,8 @@ static void __kmp_stg_print_proc_bind(kmp_str_buf_t *buffer, char const *name,
         __kmp_str_buf_print(buffer, "true");
         break;
 
-      case proc_bind_master:
-        __kmp_str_buf_print(buffer, "master");
+      case proc_bind_primary:
+        __kmp_str_buf_print(buffer, "primary");
         break;
 
       case proc_bind_close:
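
With this change both spellings of the environment variable map to
proc_bind_primary. A small check using the standard OpenMP 4.0 API (the
printed value 2 is omp_proc_bind_master, which OpenMP 5.1 renames to
omp_proc_bind_primary):

    #include <omp.h>
    #include <cstdio>

    int main() {
      // Run as: OMP_PROC_BIND=primary ./a.out   (new spelling)
      //     or: OMP_PROC_BIND=master  ./a.out   (deprecated, same policy)
      std::printf("proc-bind policy: %d\n", (int)omp_get_proc_bind());
      return 0;
    }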

diff  --git a/openmp/runtime/src/kmp_stats.cpp b/openmp/runtime/src/kmp_stats.cpp
index 47a80d97c2fd..8657bfe18c44 100644
--- a/openmp/runtime/src/kmp_stats.cpp
+++ b/openmp/runtime/src/kmp_stats.cpp
@@ -836,11 +836,11 @@ void kmp_stats_output_module::outputStats(const char *heading) {
     // Accumulate timers.
     for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
       // See if we should ignore this timer when aggregating
-      if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on master
-                                                   // and this thread is worker
+      if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on
+          // primary thread and this thread is worker
           (timeStat::workerOnly(s) && (t == 0)) // Timer only valid on worker
-          // and this thread is the master
-      ) {
+          // and this thread is the primary thread
+          ) {
         continue;
       }
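
The reflowed condition above is equivalent to this standalone predicate
(hypothetical helper, same logic): skip a timer during aggregation when it is
only meaningful on the other kind of thread.

    static bool skip_timer(bool primary_only, bool worker_only, int tid) {
      return (primary_only && tid != 0) || // primary-only timer on a worker
             (worker_only && tid == 0);    // worker-only timer on the primary
    }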
 

diff  --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h
index 1a1ec8ba2720..4c5053df3fef 100644
--- a/openmp/runtime/src/kmp_stats.h
+++ b/openmp/runtime/src/kmp_stats.h
@@ -48,9 +48,9 @@
  */
 enum stats_flags_e {
   noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
-  onlyInMaster = 1 << 1, //!< statistic is valid only for master
+  onlyInMaster = 1 << 1, //!< statistic is valid only for primary thread
   noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
-  notInMaster = 1 << 3, //!< statistic is valid only for non-master threads
+  notInMaster = 1 << 3, //!< statistic is valid only for non-primary threads
   logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
   //! KMP_STATS_EVENTS is on (valid only for timers)
 };
@@ -103,6 +103,7 @@ enum stats_state_e {
   macro(OMP_CRITICAL, 0, arg)                                                  \
   macro(OMP_SINGLE, 0, arg)                                                    \
   macro(OMP_MASTER, 0, arg)                                                    \
+  macro(OMP_MASKED, 0, arg)                                                    \
   macro(OMP_TEAMS, 0, arg)                                                     \
   macro(OMP_set_lock, 0, arg)                                                  \
   macro(OMP_test_lock, 0, arg)                                                 \
@@ -150,6 +151,7 @@ enum stats_state_e {
   macro (OMP_critical_wait, 0, arg)                                            \
   macro (OMP_single, 0, arg)                                                   \
   macro (OMP_master, 0, arg)                                                   \
+  macro (OMP_masked, 0, arg)                                                   \
   macro (OMP_task_immediate, 0, arg)                                           \
   macro (OMP_task_taskwait, 0, arg)                                            \
   macro (OMP_task_taskyield, 0, arg)                                           \
@@ -180,8 +182,8 @@ enum stats_state_e {
 // clang-format on
 
 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
-//                           initializing OpenMP or being created by a master)
-//                           until the thread is destroyed
+//                           initializing OpenMP or being created by a primary
+//                           thread) until the thread is destroyed
 // OMP_parallel           -- Time thread spends executing work directly
 //                           within a #pragma omp parallel
 // OMP_parallel_overhead  -- Time thread spends setting up a parallel region
@@ -198,6 +200,7 @@ enum stats_state_e {
 //                           a critical section
 // OMP_single             -- Time spent executing a "single" region
 // OMP_master             -- Time spent executing a "master" region
+// OMP_masked             -- Time spent executing a "masked" region
 // OMP_task_immediate     -- Time spent executing non-deferred tasks
 // OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
 //                           construct
@@ -710,7 +713,7 @@ class kmp_stats_event_vector {
     to the bar width in the timeline graph.
 
     Every thread will have a thread local pointer to its node in
-    the list.  The sentinel node is used by the master thread to
+    the list.  The sentinel node is used by the primary thread to
     store "dummy" statistics before __kmp_create_worker() is called.
 **************************************************************** */
 class kmp_stats_list {
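
The new OMP_MASKED stats entries account for the OpenMP 5.1 construct that
replaces "master". A minimal region they would cover (assumes a compiler that
already accepts the new spelling):

    #include <omp.h>
    #include <cstdio>

    int main() {
      #pragma omp parallel num_threads(4)
      {
        #pragma omp masked // 5.1 spelling of the deprecated "master"
        std::printf("primary thread %d ran the masked region\n",
                    omp_get_thread_num());
      }
      return 0;
    }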

diff  --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index ea14f49678fa..f448e215a608 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -2839,7 +2839,7 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
   if (*thread_finished) {
     // We need to un-mark this victim as a finished victim.  This must be done
     // before releasing the lock, or else other threads (starting with the
-    // master victim) might be prematurely released from the barrier!!!
+    // primary thread victim) might be prematurely released from the barrier!!!
     kmp_int32 count;
 
     count = KMP_ATOMIC_INC(unfinished_threads);
@@ -3051,7 +3051,7 @@ static inline int __kmp_execute_tasks_template(
       }
 
       // It is now unsafe to reference thread->th.th_team !!!
-      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
+      // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
       // thread to pass through the barrier, where it might reset each thread's
       // th.th_team field for the next parallel region. If we can steal more
       // work, we know that this has not happened yet.
@@ -3064,8 +3064,8 @@ static inline int __kmp_execute_tasks_template(
       }
     }
 
-    // If this thread's task team is NULL, master has recognized that there are
-    // no more tasks; bail out
+    // If this thread's task team is NULL, primary thread has recognized that
+    // there are no more tasks; bail out
     if (thread->th.th_task_team == NULL) {
       KA_TRACE(15,
                ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
@@ -3205,7 +3205,7 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
  * After a child thread checks into a barrier and calls __kmp_release() from
  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
  * longer assume that the kmp_team_t structure is intact (at any moment, the
- * master thread may exit the barrier code and free the team data structure,
+ * primary thread may exit the barrier code and free the team data structure,
  * and return the threads to the thread pool).
  *
  * This does not work with the tasking code, as the thread is still
@@ -3214,11 +3214,11 @@ static void __kmp_enable_tasking(kmp_task_team_t *task_team,
  * to each thread in the team, so that it can steal work from it.
  *
  * Enter the kmp_task_team_t struct.  It employs a reference
- * counting mechanism, and is allocated by the master thread before calling
+ * counting mechanism, and is allocated by the primary thread before calling
  * __kmp_<barrier_kind>_release, and then is released by the last thread to
  * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
  * of the kmp_task_team_t structs for consecutive barriers can overlap
- * (and will, unless the master thread is the last thread to exit the barrier
+ * (and will, unless the primary thread is the last thread to exit the barrier
  * release phase, which is not typical). The existence of such a struct is
  * useful outside the context of tasking.
  *
@@ -3590,8 +3590,8 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
       (always || team->t.t_nproc > 1)) {
     team->t.t_task_team[this_thr->th.th_task_state] =
         __kmp_allocate_task_team(this_thr, team);
-    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
-                  "for team %d at parity=%d\n",
+    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
+                  " for team %d at parity=%d\n",
                   __kmp_gtid_from_thread(this_thr),
                   team->t.t_task_team[this_thr->th.th_task_state],
                   ((team != NULL) ? team->t.t_id : -1),
@@ -3603,14 +3603,14 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
   // threads spin in the barrier release phase, they will continue to use the
   // previous task_team struct(above), until they receive the signal to stop
   // checking for tasks (they can't safely reference the kmp_team_t struct,
-  // which could be reallocated by the master thread). No task teams are formed
+  // which could be reallocated by the primary thread). No task teams are formed
   // for serialized teams.
   if (team->t.t_nproc > 1) {
     int other_team = 1 - this_thr->th.th_task_state;
     if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
       team->t.t_task_team[other_team] =
           __kmp_allocate_task_team(this_thr, team);
-      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
+      KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
                     "task_team %p for team %d at parity=%d\n",
                     __kmp_gtid_from_thread(this_thr),
                     team->t.t_task_team[other_team],
@@ -3629,7 +3629,7 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
       }
       // if team size has changed, the first thread to enable tasking will
       // realloc threads_data if necessary
-      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
+      KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
                     "%p for team %d at parity=%d\n",
                     __kmp_gtid_from_thread(this_thr),
                     team->t.t_task_team[other_team],
@@ -3679,12 +3679,12 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
             ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
 }
 
-// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
-// barrier gather phase. Only called by master thread if #threads in team > 1 or
-// if proxy tasks were created.
+// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
+// barrier gather phase. Only called by primary thread if #threads in team > 1
+// or if proxy tasks were created.
 //
 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
-// by passing in 0 optionally as the last argument. When wait is zero, master
+// by passing in 0 optionally as the last argument. When wait is zero, primary
 // thread does not wait for unfinished_threads to reach 0.
 void __kmp_task_team_wait(
     kmp_info_t *this_thr,
@@ -3696,12 +3696,12 @@ void __kmp_task_team_wait(
 
   if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
     if (wait) {
-      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
+      KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
                     "(for unfinished_threads to reach 0) on task_team = %p\n",
                     __kmp_gtid_from_thread(this_thr), task_team));
       // Worker threads may have dropped through to release phase, but could
       // still be executing tasks. Wait here for tasks to complete. To avoid
-      // memory contention, only master thread checks termination condition.
+      // memory contention, only primary thread checks termination condition.
       kmp_flag_32<false, false> flag(
           RCAST(std::atomic<kmp_uint32> *,
                 &task_team->tt.tt_unfinished_threads),
@@ -3712,7 +3712,7 @@ void __kmp_task_team_wait(
     // referencing it while spinning.
     KA_TRACE(
         20,
-        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
+        ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
          "setting active to false, setting local and team's pointer to NULL\n",
          __kmp_gtid_from_thread(this_thr), task_team));
     KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
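
The comments above describe the two-slot scheme that lets task team lifetimes
overlap across consecutive barriers. A sketch of the parity toggle
(hypothetical types, not the runtime's):

    struct team_sketch {
      void *task_team[2]; // one slot per barrier generation ("parity")
    };
    struct thread_sketch {
      unsigned task_state; // 1-bit toggle selecting the current slot
      void *task_team;     // cached team->task_team[task_state]
    };

    // At each barrier a thread flips its parity and picks up the task team
    // the primary thread prepared for the next region, so the struct for
    // the previous barrier can stay alive until its last user exits it.
    static void task_team_sync(thread_sketch *th, team_sketch *team) {
      th->task_state ^= 1;
      th->task_team = team->task_team[th->task_state];
    }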

diff  --git a/openmp/runtime/src/kmp_threadprivate.cpp b/openmp/runtime/src/kmp_threadprivate.cpp
index 8cdd4c125f3f..b79ac7d6d2b2 100644
--- a/openmp/runtime/src/kmp_threadprivate.cpp
+++ b/openmp/runtime/src/kmp_threadprivate.cpp
@@ -169,7 +169,7 @@ void __kmp_common_destroy(void) {
       struct shared_common *d_tn;
 
       /* C++ destructors need to be called once per thread before exiting.
-         Don't call destructors for master thread though unless we used copy
+         Don't call destructors for primary thread though unless we used copy
          constructor */
 
       for (d_tn = __kmp_threadprivate_d_table.data[q]; d_tn;
@@ -451,15 +451,16 @@ struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
     return tn;
 
   /* if C++ object with copy constructor, use it;
-   * else if C++ object with constructor, use it for the non-master copies only;
+   * else if C++ object with constructor, use it for the non-primary thread
+   * copies only;
    * else use pod_init and memcpy
    *
-   * C++ constructors need to be called once for each non-master thread on
+   * C++ constructors need to be called once for each non-primary thread on
    * allocate
    * C++ copy constructors need to be called once for each thread on allocate */
 
   /* C++ object with constructors/destructors; don't call constructors for
-     master thread though */
+     primary thread though */
   if (d_tn->is_vec) {
     if (d_tn->ct.ctorv != 0) {
       (void)(*d_tn->ct.ctorv)(tn->par_addr, d_tn->vec_len);
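
Seen from the user side, the constructor rules above apply to threadprivate
C++ objects like this one (exact construction points are implementation-
defined; the behavior sketched in the comments follows this file):

    #include <cstdio>

    struct Tag {
      Tag() { std::printf("Tag() ran\n"); } // once for the primary thread's
                                            // original at startup, then once
                                            // per non-primary thread copy
    };
    Tag t;
    #pragma omp threadprivate(t)

    int main() {
      #pragma omp parallel num_threads(4)
      { (void)t; } // first use allocates/constructs non-primary copies
      return 0;
    }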

