[Openmp-commits] [openmp] [OpenMP] Add skewed iteration distribution on hybrid systems (PR #69946)

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Mon Oct 23 09:48:38 PDT 2023


https://github.com/jpeyton52 created https://github.com/llvm/llvm-project/pull/69946

This commit adds a skewed distribution of iterations for the `nonmonotonic:dynamic` schedule (static steal) on hybrid systems when thread affinity is assigned. Currently, it distributes the iterations at a 60:40 ratio. Consider this loop with a dynamic schedule type,
`for (int i = 0; i < 100; ++i)`. On a hybrid system with 20 hardware threads (16 CORE and 4 ATOM), 88 iterations will be assigned to the performance cores and 12 iterations to the efficient cores. Each thread on a CORE core will process 5 iterations plus extras, and each thread on an ATOM core will process 3 iterations.
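
For reference, here is a small standalone sketch (not part of the patch) that mirrors the multiplier/ratio formulas added to `__kmp_dispatch_init_algorithm`. The runtime operates on chunks rather than raw iterations and applies its own rounding, so the exact counts it produces can differ slightly from this simplified model.

```c
/* Simplified model of the 60:40 weighting; not the runtime code itself. */
#include <math.h>
#include <stdio.h>

int main(void) {
  int ntc = 100;                 /* chunks/iterations to distribute */
  int num_procs_with_pcore = 16; /* threads bound to performance (CORE) cores */
  int num_procs_with_ecore = 4;  /* threads bound to efficient (ATOM) cores */
  int nproc = num_procs_with_pcore + num_procs_with_ecore;

  float multiplier = 60.0f / 40.0f; /* the 60:40 skew */
  float p_ratio = (float)num_procs_with_pcore / nproc;
  float e_ratio = (float)num_procs_with_ecore / nproc;
  float e_multiplier =
      1.0f / ((multiplier * num_procs_with_pcore) / nproc + e_ratio);
  float p_multiplier = multiplier * e_multiplier;

  int p_ntc = (int)roundf(ntc * p_ratio * p_multiplier); /* P-core share */
  int e_ntc = ntc - p_ntc;                               /* E-core share */

  printf("P threads get %d total (~%d each), E threads get %d total (~%d each)\n",
         p_ntc, p_ntc / num_procs_with_pcore, e_ntc,
         e_ntc / num_procs_with_ecore);
  return 0;
}
```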

Original Phabricator Patch: https://reviews.llvm.org/D152955
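
Below is a hypothetical test program (not part of the patch) showing the kind of loop that takes this code path when run against the modified runtime with affinity set, e.g. `OMP_NUM_THREADS=20 KMP_AFFINITY=compact ./a.out`. Note that the per-thread counts it prints reflect the final distribution after chunk stealing, not only the initial skewed assignment.

```c
/* Hypothetical illustration; compile with an OpenMP-enabled compiler. */
#include <omp.h>
#include <stdio.h>

#define N 100

int main(void) {
  int per_thread[256] = {0}; /* assumes at most 256 OpenMP threads */
  int nthreads = 1;
#pragma omp parallel
  {
#pragma omp single
    nthreads = omp_get_num_threads();
    int tid = omp_get_thread_num();
    /* nonmonotonic:dynamic maps to the static-steal algorithm in libomp */
#pragma omp for schedule(nonmonotonic : dynamic)
    for (int i = 0; i < N; ++i)
      per_thread[tid]++; /* count iterations executed by this thread */
  }
  for (int t = 0; t < nthreads; ++t)
    printf("thread %d executed %d iterations\n", t, per_thread[t]);
  return 0;
}
```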

From 0217024481ad42c7e26b511edbce24ef758e8ad0 Mon Sep 17 00:00:00 2001
From: Jonathan Peyton <jonathan.l.peyton at intel.com>
Date: Mon, 23 Oct 2023 10:38:06 -0500
Subject: [PATCH] [OpenMP] Add skewed iteration distribution on hybrid systems

This commit adds a skewed distribution of iterations for the nonmonotonic:dynamic
schedule (static steal) on hybrid systems when thread affinity is assigned.
Currently, it distributes the iterations at a 60:40 ratio. Consider this loop
with a dynamic schedule type, for (int i = 0; i < 100; ++i).
On a hybrid system with 20 hardware threads (16 CORE and 4 ATOM), 88 iterations
will be assigned to the performance cores and 12 iterations to the efficient
cores. Each thread on a CORE core will process 5 iterations plus extras, and
each thread on an ATOM core will process 3 iterations.
---
 openmp/runtime/src/kmp.h                      |  66 ++++--
 openmp/runtime/src/kmp_affinity.cpp           |  38 +++-
 openmp/runtime/src/kmp_dispatch.cpp           | 207 ++++++++++++++++--
 openmp/runtime/src/kmp_dispatch.h             |  14 +-
 openmp/runtime/src/kmp_global.cpp             |   3 +
 .../for/omp_for_schedule_dynamic.c            |   1 +
 6 files changed, 279 insertions(+), 50 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 339e4ca4be6b350..c854eeeb2202540 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -27,6 +27,9 @@
 #ifndef KMP_STATIC_STEAL_ENABLED
 #define KMP_STATIC_STEAL_ENABLED 1
 #endif
+#define KMP_WEIGHTED_ITERATIONS_SUPPORTED                                      \
+  (KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED &&                       \
+   (KMP_ARCH_X86 || KMP_ARCH_X86_64))
 
 #define TASK_CURRENT_NOT_QUEUED 0
 #define TASK_CURRENT_QUEUED 1
@@ -881,14 +884,8 @@ typedef struct kmp_affinity_flags_t {
 KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);
 
 typedef struct kmp_affinity_ids_t {
+  int os_id;
   int ids[KMP_HW_LAST];
-  int operator[](size_t idx) const { return ids[idx]; }
-  int &operator[](size_t idx) { return ids[idx]; }
-  kmp_affinity_ids_t &operator=(const kmp_affinity_ids_t &rhs) {
-    for (int i = 0; i < KMP_HW_LAST; ++i)
-      ids[i] = rhs[i];
-    return *this;
-  }
 } kmp_affinity_ids_t;
 
 typedef struct kmp_affinity_attrs_t {
@@ -938,6 +935,10 @@ extern kmp_affin_mask_t *__kmp_affin_fullMask;
 extern kmp_affin_mask_t *__kmp_affin_origMask;
 extern char *__kmp_cpuinfo_file;
 
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_first_osid_with_ecore;
+#endif
+
 #endif /* KMP_AFFINITY_SUPPORTED */
 
 // This needs to be kept in sync with the values in omp.h !!!
@@ -1847,6 +1848,14 @@ typedef struct kmp_sched_flags {
   unsigned contains_last : 1;
 #if KMP_USE_HIER_SCHED
   unsigned use_hier : 1;
+#if KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED
+  unsigned use_hybrid : 1;
+  unsigned unused : 27;
+#else
+  unsigned unused : 28;
+#endif
+#elif KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED
+  unsigned use_hybrid : 1;
   unsigned unused : 28;
 #else
   unsigned unused : 29;
@@ -1864,26 +1873,37 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
   kmp_int32 st;
   kmp_int32 tc;
   kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+  kmp_uint32 ordered_lower;
+  kmp_uint32 ordered_upper;
+
   // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
   //    a) parm3 is properly aligned and
   //    b) all parm1-4 are on the same cache line.
   // Because of parm1-4 are used together, performance seems to be better
   // if they are on the same cache line (not measured though).
 
-  struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
-    kmp_int32 parm1; //     structures in kmp_dispatch.cpp. This should
-    kmp_int32 parm2; //     make no real change at least while padding is off.
+  struct KMP_ALIGN(32) {
+    kmp_int32 parm1;
+    kmp_int32 parm2;
     kmp_int32 parm3;
     kmp_int32 parm4;
   };
 
-  kmp_uint32 ordered_lower;
-  kmp_uint32 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+  kmp_uint32 pchunks;
+  kmp_uint32 num_procs_with_pcore;
+  kmp_int32 first_thread_with_ecore;
+#endif
 #if KMP_OS_WINDOWS
   kmp_int32 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info32_t;
 
+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128);
+#endif
+
 typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
   kmp_int64 count; // current chunk number for static & static-steal scheduling
   kmp_int64 ub; /* upper-bound */
@@ -1892,14 +1912,16 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
   kmp_int64 st; /* stride */
   kmp_int64 tc; /* trip count (number of iterations) */
   kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+  kmp_uint64 ordered_lower;
+  kmp_uint64 ordered_upper;
   /* parm[1-4] are used in different ways by different scheduling algorithms */
 
-  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
   //    a) parm3 is properly aligned and
   //    b) all parm1-4 are in the same cache line.
   // Because of parm1-4 are used together, performance seems to be better
   // if they are in the same line (not measured though).
-
   struct KMP_ALIGN(32) {
     kmp_int64 parm1;
     kmp_int64 parm2;
@@ -1907,12 +1929,21 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
     kmp_int64 parm4;
   };
 
-  kmp_uint64 ordered_lower;
-  kmp_uint64 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+  kmp_uint64 pchunks;
+  kmp_uint64 num_procs_with_pcore;
+  kmp_int64 first_thread_with_ecore;
+#endif
+
 #if KMP_OS_WINDOWS
   kmp_int64 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info64_t;
+
+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128);
+#endif
+
 #else /* KMP_STATIC_STEAL_ENABLED */
 typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
   kmp_int32 lb;
@@ -3821,6 +3852,9 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
 extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
 extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
 extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_get_first_osid_with_ecore(void);
+#endif
 #if KMP_OS_LINUX || KMP_OS_FREEBSD
 extern int kmp_set_thread_affinity_mask_initial(void);
 #endif
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 20c1c610b9159e0..56b2e49cc30dede 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -4163,7 +4163,7 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
 
   // Initiailze ids and attrs thread data
   for (int i = 0; i < KMP_HW_LAST; ++i)
-    ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+    ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
   attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
 
   // Iterate through each os id within the mask and determine
@@ -4172,19 +4172,20 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
   int depth = __kmp_topology->get_depth();
   KMP_CPU_SET_ITERATE(cpu, mask) {
     int osid_idx = __kmp_osid_to_hwthread_map[cpu];
+    ids.os_id = cpu;
     const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
     for (int level = 0; level < depth; ++level) {
       kmp_hw_t type = __kmp_topology->get_type(level);
       int id = hw_thread.sub_ids[level];
-      if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
-        ids[type] = id;
+      if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
+        ids.ids[type] = id;
       } else {
         // This mask spans across multiple topology units, set it as such
         // and mark every level below as such as well.
-        ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+        ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
         for (; level < depth; ++level) {
           kmp_hw_t type = __kmp_topology->get_type(level);
-          ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+          ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
         }
       }
     }
@@ -4264,6 +4265,9 @@ static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
   if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
     machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
     __kmp_affinity_get_topology_info(affinity);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+    __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
+#endif
   }
 }
 
@@ -4843,7 +4847,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
 
   // Set the thread topology information to default of unknown
   for (int id = 0; id < KMP_HW_LAST; ++id)
-    th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
+    th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
   th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
 
   if (!KMP_AFFINITY_CAPABLE()) {
@@ -5240,6 +5244,28 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
   return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
 }
 
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Returns first os proc id with ATOM core
+int __kmp_get_first_osid_with_ecore(void) {
+  int low = 0;
+  int high = __kmp_topology->get_num_hw_threads() - 1;
+  int mid = 0;
+  while (high - low > 1) {
+    mid = (high + low) / 2;
+    if (__kmp_topology->at(mid).attrs.get_core_type() ==
+        KMP_HW_CORE_TYPE_CORE) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
+  }
+  if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
+    return mid;
+  }
+  return -1;
+}
+#endif
+
 // Dynamic affinity settings - Affinity balanced
 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
   KMP_DEBUG_ASSERT(th);
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index a6ee844e598862d..db22b0a20770f2f 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -90,6 +90,69 @@ static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
   return monotonicity;
 }
 
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+static inline float __kmp_get_float_val(float num) {
+  return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
+}
+static inline int __kmp_get_round_val(float num) {
+  return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
+}
+#endif
+
+template <typename T>
+inline void
+__kmp_initialize_self_buffer(kmp_team_t *team, T id,
+                             dispatch_private_info_template<T> *pr,
+                             typename traits_t<T>::unsigned_t nchunks, T nproc,
+                             typename traits_t<T>::unsigned_t &init,
+                             T &small_chunk, T &extras, T &p_extra) {
+
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+  if (pr->flags.use_hybrid) {
+    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
+    kmp_hw_core_type_t type =
+        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
+    T pchunks = pr->u.p.pchunks;
+    T echunks = nchunks - pchunks;
+    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
+    T num_procs_with_ecore = nproc - num_procs_with_pcore;
+    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
+    T big_chunk =
+        pchunks / num_procs_with_pcore; // chunks per thread with p-core
+    small_chunk =
+        echunks / num_procs_with_ecore; // chunks per thread with e-core
+
+    extras =
+        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
+
+    p_extra = (big_chunk - small_chunk);
+
+    if (type == KMP_HW_CORE_TYPE_CORE) {
+      if (id < first_thread_with_ecore) {
+        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
+      } else {
+        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
+               (id < extras ? id : extras);
+      }
+    } else {
+      if (id == first_thread_with_ecore) {
+        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
+      } else {
+        init = id * small_chunk + first_thread_with_ecore * p_extra +
+               (id < extras ? id : extras);
+      }
+    }
+    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
+    return;
+  }
+#endif
+
+  small_chunk = nchunks / nproc; // chunks per thread
+  extras = nchunks % nproc;
+  p_extra = 0;
+  init = id * small_chunk + (id < extras ? id : extras);
+}
+
 #if KMP_STATIC_STEAL_ENABLED
 enum { // values for steal_flag (possible states of private per-loop buffer)
   UNUSED = 0,
@@ -366,7 +429,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
   switch (schedule) {
 #if KMP_STATIC_STEAL_ENABLED
   case kmp_sch_static_steal: {
-    T ntc, init;
+    T ntc, init = 0;
 
     KD_TRACE(100,
              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
@@ -376,7 +439,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
     if (nproc > 1 && ntc >= nproc) {
       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
       T id = tid;
-      T small_chunk, extras;
+      T small_chunk, extras, p_extra = 0;
       kmp_uint32 old = UNUSED;
       int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
       if (traits_t<T>::type_size > 4) {
@@ -388,13 +451,109 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
         pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
         __kmp_init_lock(pr->u.p.steal_lock);
       }
-      small_chunk = ntc / nproc;
-      extras = ntc % nproc;
 
-      init = id * small_chunk + (id < extras ? id : extras);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+      // Iterations are divided in a 60/40 skewed distribution among CORE and
+      // ATOM processors for hybrid systems
+      bool use_hybrid = false;
+      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+      T first_thread_with_ecore = 0;
+      T num_procs_with_pcore = 0;
+      T num_procs_with_ecore = 0;
+      T p_ntc = 0, e_ntc = 0;
+      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
+          __kmp_affinity.type != affinity_explicit) {
+        use_hybrid = true;
+        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
+        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
+            __kmp_first_osid_with_ecore > -1) {
+          for (int i = 0; i < team->t.t_nproc; ++i) {
+            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
+                                          ->th.th_topology_attrs.core_type;
+            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
+            if (id == __kmp_first_osid_with_ecore) {
+              first_thread_with_ecore =
+                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
+            }
+            if (type == KMP_HW_CORE_TYPE_CORE) {
+              num_procs_with_pcore++;
+            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
+              num_procs_with_ecore++;
+            } else {
+              use_hybrid = false;
+              break;
+            }
+          }
+        }
+        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
+          float multiplier = 60.0 / 40.0;
+          float p_ratio = (float)num_procs_with_pcore / nproc;
+          float e_ratio = (float)num_procs_with_ecore / nproc;
+          float e_multiplier =
+              (float)1 /
+              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
+          float p_multiplier = multiplier * e_multiplier;
+          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
+          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
+            e_ntc = (int)(__kmp_get_float_val(ntc * e_ratio * e_multiplier));
+          else
+            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
+          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
+
+          // Use regular static steal if not enough chunks for skewed
+          // distribution
+          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
+                                       e_ntc >= num_procs_with_ecore)
+                            ? true
+                            : false);
+        } else {
+          use_hybrid = false;
+        }
+      }
+      pr->flags.use_hybrid = use_hybrid;
+      pr->u.p.pchunks = p_ntc;
+      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
+      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
+
+      if (use_hybrid) {
+        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
+        T big_chunk = p_ntc / num_procs_with_pcore;
+        small_chunk = e_ntc / num_procs_with_ecore;
+
+        extras =
+            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
+
+        p_extra = (big_chunk - small_chunk);
+
+        if (core_type == KMP_HW_CORE_TYPE_CORE) {
+          if (id < first_thread_with_ecore) {
+            init =
+                id * small_chunk + id * p_extra + (id < extras ? id : extras);
+          } else {
+            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
+                   (id < extras ? id : extras);
+          }
+        } else {
+          if (id == first_thread_with_ecore) {
+            init =
+                id * small_chunk + id * p_extra + (id < extras ? id : extras);
+          } else {
+            init = id * small_chunk + first_thread_with_ecore * p_extra +
+                   (id < extras ? id : extras);
+          }
+        }
+        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
+      } else
+#endif
+      {
+        small_chunk = ntc / nproc;
+        extras = ntc % nproc;
+        init = id * small_chunk + (id < extras ? id : extras);
+        p_extra = 0;
+      }
       pr->u.p.count = init;
       if (claimed) { // are we succeeded in claiming own buffer?
-        pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
         // Other threads will inspect steal_flag when searching for a victim.
         // READY means other threads may steal from this thread from now on.
         KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
@@ -1261,13 +1420,13 @@ int __kmp_dispatch_next_algorithm(int gtid,
             if (status) {
               // initialize self buffer with victim's whole range of chunks
               T id = victimId;
-              T small_chunk, extras;
-              small_chunk = nchunks / nproc; // chunks per thread
-              extras = nchunks % nproc;
-              init = id * small_chunk + (id < extras ? id : extras);
+              T small_chunk = 0, extras = 0, p_extra = 0;
+              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
+                                              init, small_chunk, extras,
+                                              p_extra);
               __kmp_acquire_lock(lck, gtid);
               pr->u.p.count = init + 1; // exclude one we execute immediately
-              pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+              pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
               __kmp_release_lock(lck, gtid);
               pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
               // no need to reinitialize other thread invariants: lb, st, etc.
@@ -1275,10 +1434,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
               {
                 char *buff;
                 // create format specifiers before the debug output
-                buff = __kmp_str_format(
-                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
-                    "count:%%%s ub:%%%s\n",
-                    traits_t<UT>::spec, traits_t<T>::spec);
+                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+                                        "stolen chunks from T#%%d, "
+                                        "count:%%%s ub:%%%s\n",
+                                        traits_t<UT>::spec, traits_t<T>::spec);
                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
                 __kmp_str_free(&buff);
               }
@@ -1404,12 +1563,12 @@ int __kmp_dispatch_next_algorithm(int gtid,
             if (status) {
               // initialize self buffer with victim's whole range of chunks
               T id = victimId;
-              T small_chunk, extras;
-              small_chunk = nchunks / nproc; // chunks per thread
-              extras = nchunks % nproc;
-              init = id * small_chunk + (id < extras ? id : extras);
+              T small_chunk = 0, extras = 0, p_extra = 0;
+              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
+                                              init, small_chunk, extras,
+                                              p_extra);
               vnew.p.count = init + 1;
-              vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+              vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
               // write pair (count, ub) at once atomically
 #if KMP_ARCH_X86
               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
@@ -1422,10 +1581,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
               {
                 char *buff;
                 // create format specifiers before the debug output
-                buff = __kmp_str_format(
-                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
-                    "count:%%%s ub:%%%s\n",
-                    traits_t<UT>::spec, traits_t<T>::spec);
+                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+                                        "stolen chunks from T#%%d, "
+                                        "count:%%%s ub:%%%s\n",
+                                        traits_t<UT>::spec, traits_t<T>::spec);
                 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
                 __kmp_str_free(&buff);
               }
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
index 154db174613dbb0..cf19eb52662cec3 100644
--- a/openmp/runtime/src/kmp_dispatch.h
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -75,14 +75,17 @@ template <typename T> struct dispatch_private_infoXX_template {
   ST st; // signed
   UT tc; // unsigned
   kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+  UT ordered_lower; // unsigned
+  UT ordered_upper; // unsigned
+
   /* parm[1-4] are used in different ways by different scheduling algorithms */
 
-  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
   //    a) parm3 is properly aligned and
   //    b) all parm1-4 are in the same cache line.
   // Because of parm1-4 are used together, performance seems to be better
   // if they are in the same line (not measured though).
-
   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
     T parm1;
     T parm2;
@@ -90,8 +93,11 @@ template <typename T> struct dispatch_private_infoXX_template {
     T parm4;
   };
 
-  UT ordered_lower; // unsigned
-  UT ordered_upper; // unsigned
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+  UT pchunks; // total number of chunks for processes with p-core
+  UT num_procs_with_pcore; // number of threads with p-core
+  T first_thread_with_ecore;
+#endif
 #if KMP_OS_WINDOWS
   T last_upper;
 #endif /* KMP_OS_WINDOWS */
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 48097fb530d1c66..b132f38fd3b0840 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -282,6 +282,9 @@ kmp_affinity_t __kmp_hh_affinity =
 kmp_affinity_t *__kmp_affinities[] = {&__kmp_affinity, &__kmp_hh_affinity};
 
 char *__kmp_cpuinfo_file = NULL;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+int __kmp_first_osid_with_ecore = -1;
+#endif
 
 #endif /* KMP_AFFINITY_SUPPORTED */
 
diff --git a/openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c b/openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
index 4433d2a3dafbe94..419187321d28d2d 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
+++ b/openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
@@ -1,4 +1,5 @@
 // RUN: %libomp-compile-and-run
+// RUN: env KMP_AFFINITY=compact,0 %libomp-run
 /*
  * Test for dynamic scheduling with chunk size
  * Method: calculate how many times the iteration space is dispatched


