[Openmp-commits] [openmp] r319422 - Extension of HWLOC topology discovery with NUMA nodes and tiles
Andrey Churbanov via Openmp-commits
openmp-commits at lists.llvm.org
Thu Nov 30 03:51:47 PST 2017
Author: achurbanov
Date: Thu Nov 30 03:51:47 2017
New Revision: 319422
URL: http://llvm.org/viewvc/llvm-project?rev=319422&view=rev
Log:
Extension of HWLOC topology discovery with NUMA nodes and tiles
Patch by Olga Malysheva
Differential Revision: https://reviews.llvm.org/D40309
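In user-visible terms, on hwloc-enabled builds the runtime can now model NUMA nodes located below a package and "tiles" (detected as a unified L2 cache shared by more than one core) as extra topology levels, and exposes them through the affinity granularity and OMP_PLACES settings. As a rough, hedged illustration only -- the keywords below come from the parsing added in kmp_settings.cpp, and they only take effect on a machine where hwloc actually reports such levels and the library is built with KMP_USE_HWLOC -- a user could request tile-level placement like this:

    export KMP_TOPOLOGY_METHOD=hwloc   # tiles are only supported with the hwloc method
    export KMP_AFFINITY=granularity=tile,compact,verbose
    # or, through the standard places mechanism:
    export OMP_PLACES=tiles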
Modified:
openmp/trunk/runtime/src/i18n/en_US.txt
openmp/trunk/runtime/src/kmp.h
openmp/trunk/runtime/src/kmp_affinity.cpp
openmp/trunk/runtime/src/kmp_global.cpp
openmp/trunk/runtime/src/kmp_settings.cpp
Modified: openmp/trunk/runtime/src/i18n/en_US.txt
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/i18n/en_US.txt?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/i18n/en_US.txt (original)
+++ openmp/trunk/runtime/src/i18n/en_US.txt Thu Nov 30 03:51:47 2017
@@ -103,6 +103,7 @@ DisplayEnvBegin "OPENMP DISPLAY EN
DisplayEnvEnd "OPENMP DISPLAY ENVIRONMENT END"
Device "[device]"
Host "[host]"
+Tile "tile"
@@ -327,6 +328,9 @@ OBSOLETE "%1$s: over
# %1, %2 -- name and value of the overriding variable, %3 -- name of overridden variable.
AffTilesNoHWLOC "%1$s: Tiles are only supported if KMP_TOPOLOGY_METHOD=hwloc, using granularity=package instead"
AffTilesNoTiles "%1$s: Tiles requested but were not detected on this HW, using granularity=package instead"
+TopologyExtraTile "%1$s: %2$d packages x %3$d tiles/pkg x %4$d cores/tile x %5$d threads/core (%6$d total cores)"
+TopologyExtraNode "%1$s: %2$d packages x %3$d nodes/pkg x %4$d cores/node x %5$d threads/core (%6$d total cores)"
+TopologyExtraNoTi "%1$s: %2$d packages x %3$d nodes/pkg x %4$d tiles/node x %5$d cores/tile x %6$d threads/core (%7$d total cores)"
OmptOutdatedWorkshare "OMPT: Cannot determine workshare type; using the default (loop) instead. "
"This issue is fixed in an up-to-date compiler."
Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Thu Nov 30 03:51:47 2017
@@ -574,6 +574,8 @@ extern kmp_SetThreadGroupAffinity_t __km
#if KMP_USE_HWLOC
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
+extern int __kmp_numa_detected;
+extern int __kmp_tile_depth;
#endif
extern size_t __kmp_affin_mask_size;
@@ -702,6 +704,8 @@ enum affinity_gran {
affinity_gran_fine = 0,
affinity_gran_thread,
affinity_gran_core,
+ affinity_gran_tile,
+ affinity_gran_numa,
affinity_gran_package,
affinity_gran_node,
#if KMP_GROUP_AFFINITY
Modified: openmp/trunk/runtime/src/kmp_affinity.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_affinity.cpp?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_affinity.cpp (original)
+++ openmp/trunk/runtime/src/kmp_affinity.cpp Thu Nov 30 03:51:47 2017
@@ -267,28 +267,62 @@ static void __kmp_affinity_print_topolog
#if KMP_USE_HWLOC
+static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
+ int depth, int *levels) {
+ int proc;
+ kmp_str_buf_t buf;
+ __kmp_str_buf_init(&buf);
+ KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
+ for (proc = 0; proc < len; proc++) {
+ __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
+ addrP[proc].first.labels[0]);
+ if (depth > 1) {
+ int level = 1; // iterate over levels
+ int label = 1; // iterate over labels
+ if (__kmp_numa_detected)
+ // node level follows package
+ if (levels[level++] > 0)
+ __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
+ addrP[proc].first.labels[label++]);
+ if (__kmp_tile_depth > 0)
+ // tile level follows node if any, or package
+ if (levels[level++] > 0)
+ __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
+ addrP[proc].first.labels[label++]);
+ if (levels[level++] > 0)
+ // core level follows
+ __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
+ addrP[proc].first.labels[label++]);
+ if (levels[level++] > 0)
+        // thread level comes last
+ __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
+ addrP[proc].first.labels[label++]);
+ KMP_DEBUG_ASSERT(label == depth);
+ }
+ KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
+ __kmp_str_buf_clear(&buf);
+ }
+ __kmp_str_buf_free(&buf);
+}
+
+static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
+
// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core; we don't want the extra thread context
// level if it offers no unique labels. So they are removed.
// return value: the new depth of address2os
-static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os,
- int nActiveThreads, int depth,
- int *pkgLevel, int *coreLevel,
- int *threadLevel) {
+static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
+ int depth, int *levels) {
int level;
int i;
int radix1_detected;
-
- for (level = depth - 1; level >= 0; --level) {
- // Always keep the package level
- if (level == *pkgLevel)
- continue;
+ int new_depth = depth;
+ for (level = depth - 1; level > 0; --level) {
// Detect if this level is radix 1
radix1_detected = 1;
- for (i = 1; i < nActiveThreads; ++i) {
- if (address2os[0].first.labels[level] !=
- address2os[i].first.labels[level]) {
+ for (i = 1; i < nTh; ++i) {
+ if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
// There are differing label values for this level so it stays
radix1_detected = 0;
break;
@@ -297,27 +331,27 @@ static int __kmp_affinity_remove_radix_o
if (!radix1_detected)
continue;
// Radix 1 was detected
- if (level == *threadLevel) {
- // If only one thread per core, then just decrement
- // the depth which removes the threadlevel from address2os
- for (i = 0; i < nActiveThreads; ++i) {
- address2os[i].first.depth--;
- }
- *threadLevel = -1;
- } else if (level == *coreLevel) {
- // For core level, we move the thread labels over if they are still
- // valid (*threadLevel != -1), and also reduce the depth another level
- for (i = 0; i < nActiveThreads; ++i) {
- if (*threadLevel != -1) {
- address2os[i].first.labels[*coreLevel] =
- address2os[i].first.labels[*threadLevel];
+ --new_depth;
+ levels[level] = -1; // mark level as not present in address2os array
+ if (level == new_depth) {
+      // "turn off" the deepest level: just decrement the depth, which removes
+      // the level from the address2os array
+ for (i = 0; i < nTh; ++i) {
+ addrP[i].first.depth--;
+ }
+ } else {
+ // For other levels, we move labels over and also reduce the depth
+ int j;
+ for (j = level; j < new_depth; ++j) {
+ for (i = 0; i < nTh; ++i) {
+ addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
+ addrP[i].first.depth--;
}
- address2os[i].first.depth--;
+ levels[j + 1] -= 1;
}
- *coreLevel = -1;
}
}
- return address2os[0].first.depth;
+ return new_depth;
}
// Returns the number of objects of type 'type' below 'obj' within the topology
@@ -340,8 +374,111 @@ static int __kmp_hwloc_get_nobjs_under_o
return retval;
}
+static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
+ hwloc_obj_t o, unsigned depth,
+ hwloc_obj_t *f) {
+ if (o->depth == depth) {
+ if (*f == NULL)
+ *f = o; // output first descendant found
+ return 1;
+ }
+ int sum = 0;
+ for (unsigned i = 0; i < o->arity; i++)
+ sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
+  return sum; // will be 0 if none were found (as PU arity is 0)
+}
+
+static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
+ hwloc_obj_type_t type,
+ hwloc_obj_t *f) {
+ if (!hwloc_compare_types(o->type, type)) {
+ if (*f == NULL)
+ *f = o; // output first descendant found
+ return 1;
+ }
+ int sum = 0;
+ for (unsigned i = 0; i < o->arity; i++)
+ sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
+  return sum; // will be 0 if none were found (as PU arity is 0)
+}
+
+static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
+ int &nActiveThreads,
+ int &num_active_cores,
+ hwloc_obj_t obj, int depth,
+ int *labels) {
+ hwloc_obj_t core = NULL;
+ hwloc_topology_t &tp = __kmp_hwloc_topology;
+ int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
+ for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
+ hwloc_obj_t pu = NULL;
+ KMP_DEBUG_ASSERT(core != NULL);
+ int num_active_threads = 0;
+ int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
+ // int NT = core->arity; pu = core->first_child; // faster?
+ for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
+ KMP_DEBUG_ASSERT(pu != NULL);
+ if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
+ continue; // skip inactive (inaccessible) unit
+ Address addr(depth + 2);
+ KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
+ obj->os_index, obj->logical_index, core->os_index,
+ core->logical_index, pu->os_index, pu->logical_index));
+ for (int i = 0; i < depth; ++i)
+ addr.labels[i] = labels[i]; // package, etc.
+ addr.labels[depth] = core_id; // core
+ addr.labels[depth + 1] = pu_id; // pu
+ addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
+ __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
+ nActiveThreads++;
+ ++num_active_threads; // count active threads per core
+ }
+ if (num_active_threads) { // were there any active threads on the core?
+ ++__kmp_ncores; // count total active cores
+ ++num_active_cores; // count active cores per socket
+ if (num_active_threads > __kmp_nThreadsPerCore)
+ __kmp_nThreadsPerCore = num_active_threads; // calc maximum
+ }
+ }
+ return 0;
+}
+
+// Check whether a NUMA node is detected below the package level, and whether
+// a tile object is present; if so, record its depth in __kmp_tile_depth
+static int __kmp_hwloc_check_numa() {
+ hwloc_topology_t &tp = __kmp_hwloc_topology;
+ hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
+ int depth;
+
+ // Get some PU
+ hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
+ if (hT == NULL) // something has gone wrong
+ return 1;
+
+ // check NUMA node below PACKAGE
+ hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
+ hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
+ KMP_DEBUG_ASSERT(hS != NULL);
+ if (hN != NULL && hN->depth > hS->depth) {
+ __kmp_numa_detected = TRUE; // socket includes node(s)
+ if (__kmp_affinity_gran == affinity_gran_node) {
+      __kmp_affinity_gran = affinity_gran_numa;
+ }
+ }
+
+ // check tile, get object by depth because of multiple caches possible
+ depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
+ hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
+ hC = NULL; // not used, but reset it here just in case
+ if (hL != NULL &&
+ __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
+ __kmp_tile_depth = depth; // tile consists of multiple cores
+ return 0;
+}
+
static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
kmp_i18n_id_t *const msg_id) {
+ hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
*address2os = NULL;
*msg_id = kmp_i18n_null;
@@ -349,11 +486,7 @@ static int __kmp_affinity_create_hwloc_m
kmp_affin_mask_t *oldMask;
KMP_CPU_ALLOC(oldMask);
__kmp_get_system_affinity(oldMask, TRUE);
-
- int depth = 3;
- int pkgLevel = 0;
- int coreLevel = 1;
- int threadLevel = 2;
+ __kmp_hwloc_check_numa();
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
@@ -361,11 +494,9 @@ static int __kmp_affinity_create_hwloc_m
KMP_ASSERT(__kmp_affinity_type == affinity_none);
nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
- hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0),
- HWLOC_OBJ_CORE);
+ hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE);
__kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
- hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0),
- HWLOC_OBJ_PU);
+ hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
if (__kmp_affinity_verbose) {
@@ -383,9 +514,18 @@ static int __kmp_affinity_create_hwloc_m
return 0;
}
+ int depth = 3;
+ int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
+  int labels[3] = {0}; // package [,node] [,tile] - head of labels array
+ if (__kmp_numa_detected)
+ ++depth;
+ if (__kmp_tile_depth)
+ ++depth;
+
// Allocate the data structure to be returned.
AddrUnsPair *retval =
(AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+ KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
__kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
// When affinity is off, this routine will still be called to set
@@ -393,67 +533,99 @@ static int __kmp_affinity_create_hwloc_m
// nCoresPerPkg, & nPackages. Make sure all these vars are set
// correctly, and return if affinity is not enabled.
- hwloc_obj_t pu;
- hwloc_obj_t core;
- hwloc_obj_t socket;
+ hwloc_obj_t socket, node, tile;
int nActiveThreads = 0;
- int socket_identifier = 0;
+ int socket_id = 0;
// re-calculate globals to count only accessible resources
__kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
- for (socket =
- hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0);
- socket != NULL; socket = hwloc_get_next_obj_by_type(
- __kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, socket),
- socket_identifier++) {
- int core_identifier = 0;
- int num_active_cores = 0;
- for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type,
- socket->logical_index,
- HWLOC_OBJ_CORE, 0);
- core != NULL &&
- hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type,
- core) == socket;
- core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE,
- core),
- core_identifier++) {
- int pu_identifier = 0;
- int num_active_threads = 0;
- for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type,
- core->logical_index, HWLOC_OBJ_PU,
- 0);
- pu != NULL &&
- hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type,
- pu) == core;
- pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU,
- pu),
- pu_identifier++) {
- Address addr(3);
- if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
- continue; // skip inactive (inaccessible) unit
- KA_TRACE(20,
- ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
- socket->os_index, socket->logical_index, core->os_index,
- core->logical_index, pu->os_index, pu->logical_index));
- addr.labels[0] = socket_identifier; // package
- addr.labels[1] = core_identifier; // core
- addr.labels[2] = pu_identifier; // pu
- retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
- __kmp_pu_os_idx[nActiveThreads] =
- pu->os_index; // keep os index for each active pu
- nActiveThreads++;
- ++num_active_threads; // count active threads per core
- }
- if (num_active_threads) { // were there any active threads on the core?
- ++__kmp_ncores; // count total active cores
- ++num_active_cores; // count active cores per socket
- if (num_active_threads > __kmp_nThreadsPerCore)
- __kmp_nThreadsPerCore = num_active_threads; // calc maximum
- }
- }
- if (num_active_cores) { // were there any active cores on the socket?
- ++nPackages; // count total active packages
- if (num_active_cores > nCoresPerPkg)
- nCoresPerPkg = num_active_cores; // calc maximum
+ nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
+ for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
+ socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
+ socket_id++) {
+ labels[0] = socket_id;
+ if (__kmp_numa_detected) {
+ int NN;
+ int n_active_nodes = 0;
+ node = NULL;
+ NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
+ &node);
+ for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
+ labels[1] = node_id;
+ if (__kmp_tile_depth) {
+ // NUMA + tiles
+ int NT;
+ int n_active_tiles = 0;
+ tile = NULL;
+ NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
+ &tile);
+ for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
+ labels[2] = tl_id;
+ int n_active_cores = 0;
+ __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+ n_active_cores, tile, 3, labels);
+            if (n_active_cores) { // were there any active cores on the tile?
+ ++n_active_tiles; // count active tiles per node
+ if (n_active_cores > nCorePerTile)
+ nCorePerTile = n_active_cores; // calc maximum
+ }
+ }
+          if (n_active_tiles) { // were there any active tiles on the node?
+ ++n_active_nodes; // count active nodes per package
+ if (n_active_tiles > nTilePerNode)
+ nTilePerNode = n_active_tiles; // calc maximum
+ }
+ } else {
+ // NUMA, no tiles
+ int n_active_cores = 0;
+ __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+ n_active_cores, node, 2, labels);
+          if (n_active_cores) { // were there any active cores on the node?
+ ++n_active_nodes; // count active nodes per package
+ if (n_active_cores > nCorePerNode)
+ nCorePerNode = n_active_cores; // calc maximum
+ }
+ }
+ }
+ if (n_active_nodes) { // were there any active nodes on the socket?
+ ++nPackages; // count total active packages
+ if (n_active_nodes > nNodePerPkg)
+ nNodePerPkg = n_active_nodes; // calc maximum
+ }
+ } else {
+ if (__kmp_tile_depth) {
+ // no NUMA, tiles
+ int NT;
+ int n_active_tiles = 0;
+ tile = NULL;
+ NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
+ &tile);
+ for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
+ labels[1] = tl_id;
+ int n_active_cores = 0;
+ __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+ n_active_cores, tile, 2, labels);
+          if (n_active_cores) { // were there any active cores on the tile?
+ ++n_active_tiles; // count active tiles per package
+ if (n_active_cores > nCorePerTile)
+ nCorePerTile = n_active_cores; // calc maximum
+ }
+ }
+ if (n_active_tiles) { // were there any active tiles on the socket?
+ ++nPackages; // count total active packages
+ if (n_active_tiles > nTilePerPkg)
+ nTilePerPkg = n_active_tiles; // calc maximum
+ }
+ } else {
+ // no NUMA, no tiles
+ int n_active_cores = 0;
+ __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
+ socket, 1, labels);
+ if (n_active_cores) { // were there any active cores on the socket?
+ ++nPackages; // count total active packages
+ if (n_active_cores > nCoresPerPkg)
+ nCoresPerPkg = n_active_cores; // calc maximum
+ }
+ }
}
}
@@ -487,7 +659,7 @@ static int __kmp_affinity_create_hwloc_m
// Form an Address object which only includes the package level.
Address addr(1);
- addr.labels[0] = retval[0].first.labels[pkgLevel];
+ addr.labels[0] = retval[0].first.labels[0];
retval[0].first = addr;
if (__kmp_affinity_gran_levels < 0) {
@@ -508,15 +680,26 @@ static int __kmp_affinity_create_hwloc_m
__kmp_affinity_cmp_Address_labels);
// Check to see if the machine topology is uniform
- unsigned uniform =
- (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
+ int nPUs = nPackages * __kmp_nThreadsPerCore;
+ if (__kmp_numa_detected) {
+ if (__kmp_tile_depth) { // NUMA + tiles
+ nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
+ } else { // NUMA, no tiles
+ nPUs *= (nNodePerPkg * nCorePerNode);
+ }
+ } else {
+ if (__kmp_tile_depth) { // no NUMA, tiles
+ nPUs *= (nTilePerPkg * nCorePerTile);
+ } else { // no NUMA, no tiles
+ nPUs *= nCoresPerPkg;
+ }
+ }
+ unsigned uniform = (nPUs == nActiveThreads);
// Print the machine topology summary.
if (__kmp_affinity_verbose) {
char mask[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
-
- KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
if (__kmp_affinity_respect_mask) {
KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
} else {
@@ -528,18 +711,29 @@ static int __kmp_affinity_create_hwloc_m
} else {
KMP_INFORM(NonUniform, "KMP_AFFINITY");
}
-
- kmp_str_buf_t buf;
- __kmp_str_buf_init(&buf);
-
- __kmp_str_buf_print(&buf, "%d", nPackages);
- // for (level = 1; level <= pkgLevel; level++) {
- // __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
- // }
- KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
- __kmp_nThreadsPerCore, __kmp_ncores);
-
- __kmp_str_buf_free(&buf);
+ if (__kmp_numa_detected) {
+ if (__kmp_tile_depth) { // NUMA + tiles
+ KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
+ nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
+ __kmp_ncores);
+ } else { // NUMA, no tiles
+ KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
+ nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
+ nPUs *= (nNodePerPkg * nCorePerNode);
+ }
+ } else {
+ if (__kmp_tile_depth) { // no NUMA, tiles
+ KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
+ nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
+ } else { // no NUMA, no tiles
+ kmp_str_buf_t buf;
+ __kmp_str_buf_init(&buf);
+ __kmp_str_buf_print(&buf, "%d", nPackages);
+ KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+ __kmp_nThreadsPerCore, __kmp_ncores);
+ __kmp_str_buf_free(&buf);
+ }
+ }
}
if (__kmp_affinity_type == affinity_none) {
@@ -548,30 +742,30 @@ static int __kmp_affinity_create_hwloc_m
return 0;
}
+ int depth_full = depth; // number of levels before compressing
  // Find any levels with radix 1, and remove them from the map
// (except for the package level).
- depth = __kmp_affinity_remove_radix_one_levels(
- retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
-
+ depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
+ levels);
+ KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
if (__kmp_affinity_gran_levels < 0) {
// Set the granularity level based on what levels are modeled
// in the machine topology map.
- __kmp_affinity_gran_levels = 0;
- if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
- __kmp_affinity_gran_levels++;
- }
- if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
- __kmp_affinity_gran_levels++;
- }
- if (__kmp_affinity_gran > affinity_gran_package) {
- __kmp_affinity_gran_levels++;
+ __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
+ if (__kmp_affinity_gran > affinity_gran_thread) {
+ for (int i = 1; i <= depth_full; ++i) {
+ if (__kmp_affinity_gran <= i) // only count deeper levels
+ break;
+ if (levels[depth_full - i] > 0)
+ __kmp_affinity_gran_levels++;
+ }
}
+ if (__kmp_affinity_gran > affinity_gran_package)
+ __kmp_affinity_gran_levels++; // e.g. granularity = group
}
- if (__kmp_affinity_verbose) {
- __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
- coreLevel, threadLevel);
- }
+ if (__kmp_affinity_verbose)
+ __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);
KMP_CPU_FREE(oldMask);
*address2os = retval;
@@ -1612,13 +1806,13 @@ static int __kmp_affinity_create_x2apici
continue;
}
if (level == pkgLevel) {
- newPkgLevel = level;
+ newPkgLevel = new_level;
}
if (level == coreLevel) {
- newCoreLevel = level;
+ newCoreLevel = new_level;
}
if (level == threadLevel) {
- newThreadLevel = level;
+ newThreadLevel = new_level;
}
for (proc = 0; (int)proc < nApics; proc++) {
new_retval[proc].first.labels[new_level] =
@@ -2364,11 +2558,13 @@ static kmp_affin_mask_t *__kmp_create_ma
depth = address2os[0].first.depth;
maxOsId = 0;
- for (i = 0; i < numAddrs; i++) {
+ for (i = numAddrs - 1;; --i) {
unsigned osId = address2os[i].second;
if (osId > maxOsId) {
maxOsId = osId;
}
+ if (i == 0)
+ break;
}
kmp_affin_mask_t *osId2Mask;
KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
@@ -3031,34 +3227,6 @@ void __kmp_affinity_process_placelist(km
#undef ADD_MASK_OSID
#if KMP_USE_HWLOC
-static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
- hwloc_obj_type_t type,
- hwloc_obj_t *f) {
- if (!hwloc_compare_types(o->type, type)) {
- if (*f == NULL)
- *f = o; // output first descendant found
- return 1;
- }
- int sum = 0;
- for (unsigned i = 0; i < o->arity; i++)
- sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
- return sum; // will be 0 if no one found (as PU arity is 0)
-}
-
-static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
- hwloc_obj_t o, unsigned depth,
- hwloc_obj_t *f) {
- if (o->depth == depth) {
- if (*f == NULL)
- *f = o; // output first descendant found
- return 1;
- }
- int sum = 0;
- for (unsigned i = 0; i < o->arity; i++)
- sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
- return sum; // will be 0 if no one found (as PU arity is 0)
-}
-
static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
// skip PUs descendants of the object o
int skipped = 0;
@@ -3156,6 +3324,7 @@ static void __kmp_apply_thread_places(Ad
goto _exit;
}
if (numa_support) {
+ hN = NULL;
int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
&hN); // num nodes in socket
if (__kmp_hws_node.num == 0)
@@ -3990,6 +4159,19 @@ static void __kmp_aux_affinity_initializ
}
}
+#if KMP_USE_HWLOC
+ else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
+ KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
+ if (__kmp_affinity_verbose) {
+ KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+ }
+ depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+ if (depth == 0) {
+ KMP_EXIT_AFF_NONE;
+ }
+ }
+#endif // KMP_USE_HWLOC
+
  // If the user has specified that a particular topology discovery method is to be
// used, then we abort if that method fails. The exception is group affinity,
// which might have been implicitly set.
@@ -4098,19 +4280,6 @@ static void __kmp_aux_affinity_initializ
KMP_ASSERT(address2os != NULL);
}
-#if KMP_USE_HWLOC
- else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
- KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
- if (__kmp_affinity_verbose) {
- KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
- }
- depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
- if (depth == 0) {
- KMP_EXIT_AFF_NONE;
- }
- }
-#endif // KMP_USE_HWLOC
-
if (address2os == NULL) {
if (KMP_AFFINITY_CAPABLE() &&
(__kmp_affinity_verbose ||
@@ -4122,7 +4291,11 @@ static void __kmp_aux_affinity_initializ
return;
}
- if (__kmp_affinity_gran == affinity_gran_tile && __kmp_tile_depth == 0) {
+ if (__kmp_affinity_gran == affinity_gran_tile
+#if KMP_USE_HWLOC
+ && __kmp_tile_depth == 0
+#endif
+ ) {
// tiles requested but not detected, warn user on this
KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
}
Modified: openmp/trunk/runtime/src/kmp_global.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_global.cpp?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_global.cpp (original)
+++ openmp/trunk/runtime/src/kmp_global.cpp Thu Nov 30 03:51:47 2017
@@ -243,6 +243,8 @@ KMPAffinity *__kmp_affinity_dispatch = N
#if KMP_USE_HWLOC
int __kmp_hwloc_error = FALSE;
hwloc_topology_t __kmp_hwloc_topology = NULL;
+int __kmp_numa_detected = FALSE;
+int __kmp_tile_depth = 0;
#endif
#if KMP_OS_WINDOWS
Modified: openmp/trunk/runtime/src/kmp_settings.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_settings.cpp?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_settings.cpp (original)
+++ openmp/trunk/runtime/src/kmp_settings.cpp Thu Nov 30 03:51:47 2017
@@ -2084,6 +2084,11 @@ static void __kmp_parse_affinity_env(cha
} else if (__kmp_match_str("core", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_core, -1);
buf = next;
+#if KMP_USE_HWLOC
+ } else if (__kmp_match_str("tile", buf, CCAST(const char **, &next))) {
+ set_gran(affinity_gran_tile, -1);
+ buf = next;
+#endif
} else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) {
set_gran(affinity_gran_package, -1);
buf = next;
@@ -2724,6 +2729,14 @@ static void __kmp_stg_parse_places(char
__kmp_affinity_gran = affinity_gran_core;
__kmp_affinity_dups = FALSE;
kind = "\"cores\"";
+#if KMP_USE_HWLOC
+ } else if (__kmp_match_str("tiles", scan, &next)) {
+ scan = next;
+ __kmp_affinity_type = affinity_compact;
+ __kmp_affinity_gran = affinity_gran_tile;
+ __kmp_affinity_dups = FALSE;
+ kind = "\"tiles\"";
+#endif
} else if (__kmp_match_str("sockets", scan, &next)) {
scan = next;
__kmp_affinity_type = affinity_compact;
@@ -2821,6 +2834,14 @@ static void __kmp_stg_print_places(kmp_s
} else {
__kmp_str_buf_print(buffer, "='cores'\n");
}
+#if KMP_USE_HWLOC
+ } else if (__kmp_affinity_gran == affinity_gran_tile) {
+ if (num > 0) {
+ __kmp_str_buf_print(buffer, "='tiles(%d)' \n", num);
+ } else {
+ __kmp_str_buf_print(buffer, "='tiles'\n");
+ }
+#endif
} else if (__kmp_affinity_gran == affinity_gran_package) {
if (num > 0) {
__kmp_str_buf_print(buffer, "='sockets(%d)'\n", num);
@@ -2874,6 +2895,11 @@ static void __kmp_stg_parse_topology_met
if (__kmp_str_match("all", 1, value)) {
__kmp_affinity_top_method = affinity_top_method_all;
}
+#if KMP_USE_HWLOC
+ else if (__kmp_str_match("hwloc", 1, value)) {
+ __kmp_affinity_top_method = affinity_top_method_hwloc;
+ }
+#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
else if (__kmp_str_match("x2apic id", 9, value) ||
__kmp_str_match("x2apic_id", 9, value) ||
@@ -2934,13 +2960,7 @@ static void __kmp_stg_parse_topology_met
#endif /* KMP_GROUP_AFFINITY */
else if (__kmp_str_match("flat", 1, value)) {
__kmp_affinity_top_method = affinity_top_method_flat;
- }
-#if KMP_USE_HWLOC
- else if (__kmp_str_match("hwloc", 1, value)) {
- __kmp_affinity_top_method = affinity_top_method_hwloc;
- }
-#endif
- else {
+ } else {
KMP_WARNING(StgInvalidValue, name, value);
}
} // __kmp_stg_parse_topology_method
@@ -5223,6 +5243,9 @@ void __kmp_env_initialize(char const *st
case affinity_gran_node:
str = "node";
break;
+ case affinity_gran_tile:
+ str = "tile";
+ break;
default:
KMP_DEBUG_ASSERT(0);
}