[Openmp-commits] [openmp] r319422 - Extension of HWLOC topology discovery with NUMA nodes and tiles

Andrey Churbanov via Openmp-commits openmp-commits at lists.llvm.org
Thu Nov 30 03:51:47 PST 2017


Author: achurbanov
Date: Thu Nov 30 03:51:47 2017
New Revision: 319422

URL: http://llvm.org/viewvc/llvm-project?rev=319422&view=rev
Log:
Extension of HWLOC topology discovery with NUMA nodes and tiles

Patch by Olga Malysheva

Differential Revision: https://reviews.llvm.org/D40309

Modified:
    openmp/trunk/runtime/src/i18n/en_US.txt
    openmp/trunk/runtime/src/kmp.h
    openmp/trunk/runtime/src/kmp_affinity.cpp
    openmp/trunk/runtime/src/kmp_global.cpp
    openmp/trunk/runtime/src/kmp_settings.cpp

Modified: openmp/trunk/runtime/src/i18n/en_US.txt
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/i18n/en_US.txt?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/i18n/en_US.txt (original)
+++ openmp/trunk/runtime/src/i18n/en_US.txt Thu Nov 30 03:51:47 2017
@@ -103,6 +103,7 @@ DisplayEnvBegin		     "OPENMP DISPLAY EN
 DisplayEnvEnd		     "OPENMP DISPLAY ENVIRONMENT END"
 Device			     "[device]"
 Host			     "[host]"
+Tile                         "tile"
 
 
 
@@ -327,6 +328,9 @@ OBSOLETE                     "%1$s: over
                                  # %1, %2 -- name and value of the overriding variable, %3 -- name of overriden variable.
 AffTilesNoHWLOC              "%1$s: Tiles are only supported if KMP_TOPOLOGY_METHOD=hwloc, using granularity=package instead"
 AffTilesNoTiles              "%1$s: Tiles requested but were not detected on this HW, using granularity=package instead"
+TopologyExtraTile            "%1$s: %2$d packages x %3$d tiles/pkg x %4$d cores/tile x %5$d threads/core (%6$d total cores)"
+TopologyExtraNode            "%1$s: %2$d packages x %3$d nodes/pkg x %4$d cores/node x %5$d threads/core (%6$d total cores)"
+TopologyExtraNoTi            "%1$s: %2$d packages x %3$d nodes/pkg x %4$d tiles/node x %5$d cores/tile x %6$d threads/core (%7$d total cores)"
 OmptOutdatedWorkshare        "OMPT: Cannot determine workshare type; using the default (loop) instead. "
                              "This issue is fixed in an up-to-date compiler."
 

Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Thu Nov 30 03:51:47 2017
@@ -574,6 +574,8 @@ extern kmp_SetThreadGroupAffinity_t __km
 #if KMP_USE_HWLOC
 extern hwloc_topology_t __kmp_hwloc_topology;
 extern int __kmp_hwloc_error;
+extern int __kmp_numa_detected;
+extern int __kmp_tile_depth;
 #endif
 
 extern size_t __kmp_affin_mask_size;
@@ -702,6 +704,8 @@ enum affinity_gran {
   affinity_gran_fine = 0,
   affinity_gran_thread,
   affinity_gran_core,
+  affinity_gran_tile,
+  affinity_gran_numa,
   affinity_gran_package,
   affinity_gran_node,
 #if KMP_GROUP_AFFINITY

Modified: openmp/trunk/runtime/src/kmp_affinity.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_affinity.cpp?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_affinity.cpp (original)
+++ openmp/trunk/runtime/src/kmp_affinity.cpp Thu Nov 30 03:51:47 2017
@@ -267,28 +267,62 @@ static void __kmp_affinity_print_topolog
 
 #if KMP_USE_HWLOC
 
+static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
+                                          int depth, int *levels) { // verbose dump of the hwloc-built map
+  int proc; // index into addrP; one informational line is emitted per entry
+  kmp_str_buf_t buf; // holds the "<level name> <label> ..." text of one entry
+  __kmp_str_buf_init(&buf);
+  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
+  for (proc = 0; proc < len; proc++) {
+    __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
+                        addrP[proc].first.labels[0]);
+    if (depth > 1) {
+      int level = 1; // index into levels[]; advances for each level tested
+      int label = 1; // index into labels[]; advances only for printed levels
+      if (__kmp_numa_detected)
+        // node level follows package; printed only if its levels[] entry > 0
+        if (levels[level++] > 0)
+          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
+                              addrP[proc].first.labels[label++]);
+      if (__kmp_tile_depth > 0)
+        // tile level follows node if any, or package
+        if (levels[level++] > 0)
+          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
+                              addrP[proc].first.labels[label++]);
+      if (levels[level++] > 0)
+        // core level follows
+        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
+                            addrP[proc].first.labels[label++]);
+      if (levels[level++] > 0)
+        // thread level is the last one
+        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
+                            addrP[proc].first.labels[label++]);
+      KMP_DEBUG_ASSERT(label == depth); // every remaining label was consumed
+    }
+    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
+    __kmp_str_buf_clear(&buf); // reuse the same buffer for the next entry
+  }
+  __kmp_str_buf_free(&buf);
+}
+
+static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
+
 // This function removes the topology levels that are radix 1 and don't offer
 // further information about the topology.  The most common example is when you
 // have one thread context per core, we don't want the extra thread context
 // level if it offers no unique labels.  So they are removed.
 // return value: the new depth of address2os
-static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os,
-                                                  int nActiveThreads, int depth,
-                                                  int *pkgLevel, int *coreLevel,
-                                                  int *threadLevel) {
+static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
+                                                  int depth, int *levels) {
   int level;
   int i;
   int radix1_detected;
-
-  for (level = depth - 1; level >= 0; --level) {
-    // Always keep the package level
-    if (level == *pkgLevel)
-      continue;
+  int new_depth = depth;
+  for (level = depth - 1; level > 0; --level) {
     // Detect if this level is radix 1
     radix1_detected = 1;
-    for (i = 1; i < nActiveThreads; ++i) {
-      if (address2os[0].first.labels[level] !=
-          address2os[i].first.labels[level]) {
+    for (i = 1; i < nTh; ++i) {
+      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
         // There are differing label values for this level so it stays
         radix1_detected = 0;
         break;
@@ -297,27 +331,27 @@ static int __kmp_affinity_remove_radix_o
     if (!radix1_detected)
       continue;
     // Radix 1 was detected
-    if (level == *threadLevel) {
-      // If only one thread per core, then just decrement
-      // the depth which removes the threadlevel from address2os
-      for (i = 0; i < nActiveThreads; ++i) {
-        address2os[i].first.depth--;
-      }
-      *threadLevel = -1;
-    } else if (level == *coreLevel) {
-      // For core level, we move the thread labels over if they are still
-      // valid (*threadLevel != -1), and also reduce the depth another level
-      for (i = 0; i < nActiveThreads; ++i) {
-        if (*threadLevel != -1) {
-          address2os[i].first.labels[*coreLevel] =
-              address2os[i].first.labels[*threadLevel];
+    --new_depth;
+    levels[level] = -1; // mark level as not present in address2os array
+    if (level == new_depth) {
+      // "turn off" deepest level, just decrement the depth that removes
+      // the level from address2os array
+      for (i = 0; i < nTh; ++i) {
+        addrP[i].first.depth--;
+      }
+    } else {
+      // For other levels, we move labels over and also reduce the depth
+      int j;
+      for (j = level; j < new_depth; ++j) {
+        for (i = 0; i < nTh; ++i) {
+          addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
+          addrP[i].first.depth--;
         }
-        address2os[i].first.depth--;
+        levels[j + 1] -= 1;
       }
-      *coreLevel = -1;
     }
   }
-  return address2os[0].first.depth;
+  return new_depth;
 }
 
 // Returns the number of objects of type 'type' below 'obj' within the topology
@@ -340,8 +374,111 @@ static int __kmp_hwloc_get_nobjs_under_o
   return retval;
 }
 
+static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
+                                               hwloc_obj_t o, unsigned depth,
+                                               hwloc_obj_t *f) { // counts objects at 'depth' in o's subtree
+  if (o->depth == depth) { // o itself sits at the requested depth
+    if (*f == NULL)
+      *f = o; // output first match only; caller must pass *f == NULL to get it
+    return 1;
+  }
+  int sum = 0; // matches accumulated over the subtrees of o's children
+  for (unsigned i = 0; i < o->arity; i++)
+    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
+  return sum; // will be 0 if none found (recursion ends at PUs, arity 0)
+}
+
+static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
+                                              hwloc_obj_type_t type,
+                                              hwloc_obj_t *f) { // counts objects of 'type' in o's subtree
+  if (!hwloc_compare_types(o->type, type)) { // 0 result: o is of the type
+    if (*f == NULL)
+      *f = o; // output first match only; caller must pass *f == NULL to get it
+    return 1;
+  }
+  int sum = 0; // matches accumulated over the subtrees of o's children
+  for (unsigned i = 0; i < o->arity; i++)
+    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
+  return sum; // will be 0 if none found (recursion ends at PUs, arity 0)
+}
+
+static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
+                                           int &nActiveThreads,
+                                           int &num_active_cores,
+                                           hwloc_obj_t obj, int depth,
+                                           int *labels) { // appends every accessible PU under obj to addrPair
+  hwloc_obj_t core = NULL; // must be NULL so the helper stores the first core
+  hwloc_topology_t &tp = __kmp_hwloc_topology;
+  int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
+  for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
+    hwloc_obj_t pu = NULL; // must be NULL so the helper stores the first PU
+    KMP_DEBUG_ASSERT(core != NULL);
+    int num_active_threads = 0;
+    int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
+    // int NT = core->arity; pu = core->first_child; // faster?
+    for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
+      KMP_DEBUG_ASSERT(pu != NULL);
+      if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
+        continue; // skip inactive (inaccessible) unit; pu_id still advances
+      Address addr(depth + 2); // 'depth' leading labels plus core and pu
+      KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
+                    obj->os_index, obj->logical_index, core->os_index,
+                    core->logical_index, pu->os_index, pu->logical_index));
+      for (int i = 0; i < depth; ++i)
+        addr.labels[i] = labels[i]; // package, etc.
+      addr.labels[depth] = core_id; // core
+      addr.labels[depth + 1] = pu_id; // pu
+      addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
+      __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index of the pu
+      nActiveThreads++;
+      ++num_active_threads; // count active threads per core
+    }
+    if (num_active_threads) { // were there any active threads on the core?
+      ++__kmp_ncores; // count total active cores
+      ++num_active_cores; // count active cores under obj (caller's counter)
+      if (num_active_threads > __kmp_nThreadsPerCore)
+        __kmp_nThreadsPerCore = num_active_threads; // calc maximum
+    }
+  }
+  return 0; // the only value this function ever returns
+}
+
+// Probe the hwloc topology from PU 0: flag NUMA node(s) nested below the
+// package and record the depth of a multi-core L2 (tile). Returns 1 if no PU.
+static int __kmp_hwloc_check_numa() {
+  hwloc_topology_t &tp = __kmp_hwloc_topology;
+  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
+  int depth;
+
+  // Get some PU
+  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
+  if (hT == NULL) // something has gone wrong
+    return 1;
+
+  // check NUMA node below PACKAGE
+  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
+  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
+  KMP_DEBUG_ASSERT(hS != NULL);
+  if (hN != NULL && hN->depth > hS->depth) {
+    __kmp_numa_detected = TRUE; // socket includes node(s)
+    if (__kmp_affinity_gran == affinity_gran_node) {
+      __kmp_affinity_gran = affinity_gran_numa; // assign (was a no-op '==')
+    }
+  }
+
+  // check tile, get object by depth because of multiple caches possible
+  depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
+  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
+  hC = NULL; // result unused, but must be NULL for the helper to store into
+  if (hL != NULL &&
+      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
+    __kmp_tile_depth = depth; // tile consists of multiple cores
+  return 0;
+}
+
 static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
+  hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
   *address2os = NULL;
   *msg_id = kmp_i18n_null;
 
@@ -349,11 +486,7 @@ static int __kmp_affinity_create_hwloc_m
   kmp_affin_mask_t *oldMask;
   KMP_CPU_ALLOC(oldMask);
   __kmp_get_system_affinity(oldMask, TRUE);
-
-  int depth = 3;
-  int pkgLevel = 0;
-  int coreLevel = 1;
-  int threadLevel = 2;
+  __kmp_hwloc_check_numa();
 
   if (!KMP_AFFINITY_CAPABLE()) {
     // Hack to try and infer the machine topology using only the data
@@ -361,11 +494,9 @@ static int __kmp_affinity_create_hwloc_m
     KMP_ASSERT(__kmp_affinity_type == affinity_none);
 
     nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
-        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0),
-        HWLOC_OBJ_CORE);
+        hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE);
     __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
-        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0),
-        HWLOC_OBJ_PU);
+        hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
     if (__kmp_affinity_verbose) {
@@ -383,9 +514,18 @@ static int __kmp_affinity_create_hwloc_m
     return 0;
   }
 
+  int depth = 3;
+  int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
+  int labels[3] = {0}; // package [,node] [,tile] - head of lables array
+  if (__kmp_numa_detected)
+    ++depth;
+  if (__kmp_tile_depth)
+    ++depth;
+
   // Allocate the data structure to be returned.
   AddrUnsPair *retval =
       (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
 
   // When affinity is off, this routine will still be called to set
@@ -393,67 +533,99 @@ static int __kmp_affinity_create_hwloc_m
   // nCoresPerPkg, & nPackages.  Make sure all these vars are set
   // correctly, and return if affinity is not enabled.
 
-  hwloc_obj_t pu;
-  hwloc_obj_t core;
-  hwloc_obj_t socket;
+  hwloc_obj_t socket, node, tile;
   int nActiveThreads = 0;
-  int socket_identifier = 0;
+  int socket_id = 0;
   // re-calculate globals to count only accessible resources
   __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
-  for (socket =
-           hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0);
-       socket != NULL; socket = hwloc_get_next_obj_by_type(
-                           __kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, socket),
-      socket_identifier++) {
-    int core_identifier = 0;
-    int num_active_cores = 0;
-    for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type,
-                                            socket->logical_index,
-                                            HWLOC_OBJ_CORE, 0);
-         core != NULL &&
-         hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type,
-                                        core) == socket;
-         core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE,
-                                           core),
-        core_identifier++) {
-      int pu_identifier = 0;
-      int num_active_threads = 0;
-      for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type,
-                                            core->logical_index, HWLOC_OBJ_PU,
-                                            0);
-           pu != NULL &&
-           hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type,
-                                          pu) == core;
-           pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU,
-                                           pu),
-          pu_identifier++) {
-        Address addr(3);
-        if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
-          continue; // skip inactive (inaccessible) unit
-        KA_TRACE(20,
-                 ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
-                  socket->os_index, socket->logical_index, core->os_index,
-                  core->logical_index, pu->os_index, pu->logical_index));
-        addr.labels[0] = socket_identifier; // package
-        addr.labels[1] = core_identifier; // core
-        addr.labels[2] = pu_identifier; // pu
-        retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
-        __kmp_pu_os_idx[nActiveThreads] =
-            pu->os_index; // keep os index for each active pu
-        nActiveThreads++;
-        ++num_active_threads; // count active threads per core
-      }
-      if (num_active_threads) { // were there any active threads on the core?
-        ++__kmp_ncores; // count total active cores
-        ++num_active_cores; // count active cores per socket
-        if (num_active_threads > __kmp_nThreadsPerCore)
-          __kmp_nThreadsPerCore = num_active_threads; // calc maximum
-      }
-    }
-    if (num_active_cores) { // were there any active cores on the socket?
-      ++nPackages; // count total active packages
-      if (num_active_cores > nCoresPerPkg)
-        nCoresPerPkg = num_active_cores; // calc maximum
+  nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
+  for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
+       socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
+      socket_id++) {
+    labels[0] = socket_id;
+    if (__kmp_numa_detected) {
+      int NN;
+      int n_active_nodes = 0;
+      node = NULL;
+      NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
+                                              &node);
+      for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
+        labels[1] = node_id;
+        if (__kmp_tile_depth) {
+          // NUMA + tiles
+          int NT;
+          int n_active_tiles = 0;
+          tile = NULL;
+          NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
+                                                   &tile);
+          for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
+            labels[2] = tl_id;
+            int n_active_cores = 0;
+            __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+                                            n_active_cores, tile, 3, labels);
+            if (n_active_cores) { // were there any active cores on the socket?
+              ++n_active_tiles; // count active tiles per node
+              if (n_active_cores > nCorePerTile)
+                nCorePerTile = n_active_cores; // calc maximum
+            }
+          }
+          if (n_active_tiles) { // were there any active tiles on the socket?
+            ++n_active_nodes; // count active nodes per package
+            if (n_active_tiles > nTilePerNode)
+              nTilePerNode = n_active_tiles; // calc maximum
+          }
+        } else {
+          // NUMA, no tiles
+          int n_active_cores = 0;
+          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+                                          n_active_cores, node, 2, labels);
+          if (n_active_cores) { // were there any active cores on the socket?
+            ++n_active_nodes; // count active nodes per package
+            if (n_active_cores > nCorePerNode)
+              nCorePerNode = n_active_cores; // calc maximum
+          }
+        }
+      }
+      if (n_active_nodes) { // were there any active nodes on the socket?
+        ++nPackages; // count total active packages
+        if (n_active_nodes > nNodePerPkg)
+          nNodePerPkg = n_active_nodes; // calc maximum
+      }
+    } else {
+      if (__kmp_tile_depth) {
+        // no NUMA, tiles
+        int NT;
+        int n_active_tiles = 0;
+        tile = NULL;
+        NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
+                                                 &tile);
+        for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
+          labels[1] = tl_id;
+          int n_active_cores = 0;
+          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+                                          n_active_cores, tile, 2, labels);
+          if (n_active_cores) { // were there any active cores on the socket?
+            ++n_active_tiles; // count active tiles per package
+            if (n_active_cores > nCorePerTile)
+              nCorePerTile = n_active_cores; // calc maximum
+          }
+        }
+        if (n_active_tiles) { // were there any active tiles on the socket?
+          ++nPackages; // count total active packages
+          if (n_active_tiles > nTilePerPkg)
+            nTilePerPkg = n_active_tiles; // calc maximum
+        }
+      } else {
+        // no NUMA, no tiles
+        int n_active_cores = 0;
+        __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
+                                        socket, 1, labels);
+        if (n_active_cores) { // were there any active cores on the socket?
+          ++nPackages; // count total active packages
+          if (n_active_cores > nCoresPerPkg)
+            nCoresPerPkg = n_active_cores; // calc maximum
+        }
+      }
     }
   }
 
@@ -487,7 +659,7 @@ static int __kmp_affinity_create_hwloc_m
 
     // Form an Address object which only includes the package level.
     Address addr(1);
-    addr.labels[0] = retval[0].first.labels[pkgLevel];
+    addr.labels[0] = retval[0].first.labels[0];
     retval[0].first = addr;
 
     if (__kmp_affinity_gran_levels < 0) {
@@ -508,15 +680,26 @@ static int __kmp_affinity_create_hwloc_m
         __kmp_affinity_cmp_Address_labels);
 
   // Check to see if the machine topology is uniform
-  unsigned uniform =
-      (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);
+  int nPUs = nPackages * __kmp_nThreadsPerCore;
+  if (__kmp_numa_detected) {
+    if (__kmp_tile_depth) { // NUMA + tiles
+      nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
+    } else { // NUMA, no tiles
+      nPUs *= (nNodePerPkg * nCorePerNode);
+    }
+  } else {
+    if (__kmp_tile_depth) { // no NUMA, tiles
+      nPUs *= (nTilePerPkg * nCorePerTile);
+    } else { // no NUMA, no tiles
+      nPUs *= nCoresPerPkg;
+    }
+  }
+  unsigned uniform = (nPUs == nActiveThreads);
 
   // Print the machine topology summary.
   if (__kmp_affinity_verbose) {
     char mask[KMP_AFFIN_MASK_PRINT_LEN];
     __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
-
-    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
     if (__kmp_affinity_respect_mask) {
       KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
     } else {
@@ -528,18 +711,29 @@ static int __kmp_affinity_create_hwloc_m
     } else {
       KMP_INFORM(NonUniform, "KMP_AFFINITY");
     }
-
-    kmp_str_buf_t buf;
-    __kmp_str_buf_init(&buf);
-
-    __kmp_str_buf_print(&buf, "%d", nPackages);
-    // for (level = 1; level <= pkgLevel; level++) {
-    //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
-    // }
-    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
-               __kmp_nThreadsPerCore, __kmp_ncores);
-
-    __kmp_str_buf_free(&buf);
+    if (__kmp_numa_detected) {
+      if (__kmp_tile_depth) { // NUMA + tiles
+        KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
+                   nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
+                   __kmp_ncores);
+      } else { // NUMA, no tiles
+        KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
+                   nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
+        nPUs *= (nNodePerPkg * nCorePerNode);
+      }
+    } else {
+      if (__kmp_tile_depth) { // no NUMA, tiles
+        KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
+                   nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
+      } else { // no NUMA, no tiles
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init(&buf);
+        __kmp_str_buf_print(&buf, "%d", nPackages);
+        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+                   __kmp_nThreadsPerCore, __kmp_ncores);
+        __kmp_str_buf_free(&buf);
+      }
+    }
   }
 
   if (__kmp_affinity_type == affinity_none) {
@@ -548,30 +742,30 @@ static int __kmp_affinity_create_hwloc_m
     return 0;
   }
 
+  int depth_full = depth; // number of levels before compressing
   // Find any levels with radiix 1, and remove them from the map
   // (except for the package level).
-  depth = __kmp_affinity_remove_radix_one_levels(
-      retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
-
+  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
+                                                 levels);
+  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
   if (__kmp_affinity_gran_levels < 0) {
     // Set the granularity level based on what levels are modeled
     // in the machine topology map.
-    __kmp_affinity_gran_levels = 0;
-    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
-      __kmp_affinity_gran_levels++;
-    }
-    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
-      __kmp_affinity_gran_levels++;
-    }
-    if (__kmp_affinity_gran > affinity_gran_package) {
-      __kmp_affinity_gran_levels++;
+    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
+    if (__kmp_affinity_gran > affinity_gran_thread) {
+      for (int i = 1; i <= depth_full; ++i) {
+        if (__kmp_affinity_gran <= i) // only count deeper levels
+          break;
+        if (levels[depth_full - i] > 0)
+          __kmp_affinity_gran_levels++;
+      }
     }
+    if (__kmp_affinity_gran > affinity_gran_package)
+      __kmp_affinity_gran_levels++; // e.g. granularity = group
   }
 
-  if (__kmp_affinity_verbose) {
-    __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
-                                  coreLevel, threadLevel);
-  }
+  if (__kmp_affinity_verbose)
+    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);
 
   KMP_CPU_FREE(oldMask);
   *address2os = retval;
@@ -1612,13 +1806,13 @@ static int __kmp_affinity_create_x2apici
         continue;
       }
       if (level == pkgLevel) {
-        newPkgLevel = level;
+        newPkgLevel = new_level;
       }
       if (level == coreLevel) {
-        newCoreLevel = level;
+        newCoreLevel = new_level;
       }
       if (level == threadLevel) {
-        newThreadLevel = level;
+        newThreadLevel = new_level;
       }
       for (proc = 0; (int)proc < nApics; proc++) {
         new_retval[proc].first.labels[new_level] =
@@ -2364,11 +2558,13 @@ static kmp_affin_mask_t *__kmp_create_ma
   depth = address2os[0].first.depth;
 
   maxOsId = 0;
-  for (i = 0; i < numAddrs; i++) {
+  for (i = numAddrs - 1;; --i) {
     unsigned osId = address2os[i].second;
     if (osId > maxOsId) {
       maxOsId = osId;
     }
+    if (i == 0)
+      break;
   }
   kmp_affin_mask_t *osId2Mask;
   KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
@@ -3031,34 +3227,6 @@ void __kmp_affinity_process_placelist(km
 #undef ADD_MASK_OSID
 
 #if KMP_USE_HWLOC
-static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
-                                              hwloc_obj_type_t type,
-                                              hwloc_obj_t *f) {
-  if (!hwloc_compare_types(o->type, type)) {
-    if (*f == NULL)
-      *f = o; // output first descendant found
-    return 1;
-  }
-  int sum = 0;
-  for (unsigned i = 0; i < o->arity; i++)
-    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
-  return sum; // will be 0 if no one found (as PU arity is 0)
-}
-
-static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
-                                               hwloc_obj_t o, unsigned depth,
-                                               hwloc_obj_t *f) {
-  if (o->depth == depth) {
-    if (*f == NULL)
-      *f = o; // output first descendant found
-    return 1;
-  }
-  int sum = 0;
-  for (unsigned i = 0; i < o->arity; i++)
-    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
-  return sum; // will be 0 if no one found (as PU arity is 0)
-}
-
 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
   // skip PUs descendants of the object o
   int skipped = 0;
@@ -3156,6 +3324,7 @@ static void __kmp_apply_thread_places(Ad
       goto _exit;
     }
     if (numa_support) {
+      hN = NULL;
       int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
                                                   &hN); // num nodes in socket
       if (__kmp_hws_node.num == 0)
@@ -3990,6 +4159,19 @@ static void __kmp_aux_affinity_initializ
     }
   }
 
+#if KMP_USE_HWLOC
+  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
+    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+    }
+    depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+  }
+#endif // KMP_USE_HWLOC
+
 // If the user has specified that a paricular topology discovery method is to be
 // used, then we abort if that method fails. The exception is group affinity,
 // which might have been implicitly set.
@@ -4098,19 +4280,6 @@ static void __kmp_aux_affinity_initializ
     KMP_ASSERT(address2os != NULL);
   }
 
-#if KMP_USE_HWLOC
-  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
-    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
-    if (__kmp_affinity_verbose) {
-      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
-    }
-    depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
-    if (depth == 0) {
-      KMP_EXIT_AFF_NONE;
-    }
-  }
-#endif // KMP_USE_HWLOC
-
   if (address2os == NULL) {
     if (KMP_AFFINITY_CAPABLE() &&
         (__kmp_affinity_verbose ||
@@ -4122,7 +4291,11 @@ static void __kmp_aux_affinity_initializ
     return;
   }
 
-  if (__kmp_affinity_gran == affinity_gran_tile && __kmp_tile_depth == 0) {
+  if (__kmp_affinity_gran == affinity_gran_tile
+#if KMP_USE_HWLOC
+      && __kmp_tile_depth == 0
+#endif
+      ) {
     // tiles requested but not detected, warn user on this
     KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
   }

Modified: openmp/trunk/runtime/src/kmp_global.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_global.cpp?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_global.cpp (original)
+++ openmp/trunk/runtime/src/kmp_global.cpp Thu Nov 30 03:51:47 2017
@@ -243,6 +243,8 @@ KMPAffinity *__kmp_affinity_dispatch = N
 #if KMP_USE_HWLOC
 int __kmp_hwloc_error = FALSE;
 hwloc_topology_t __kmp_hwloc_topology = NULL;
+int __kmp_numa_detected = FALSE;
+int __kmp_tile_depth = 0;
 #endif
 
 #if KMP_OS_WINDOWS

Modified: openmp/trunk/runtime/src/kmp_settings.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_settings.cpp?rev=319422&r1=319421&r2=319422&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_settings.cpp (original)
+++ openmp/trunk/runtime/src/kmp_settings.cpp Thu Nov 30 03:51:47 2017
@@ -2084,6 +2084,11 @@ static void __kmp_parse_affinity_env(cha
       } else if (__kmp_match_str("core", buf, CCAST(const char **, &next))) {
         set_gran(affinity_gran_core, -1);
         buf = next;
+#if KMP_USE_HWLOC
+      } else if (__kmp_match_str("tile", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_tile, -1);
+        buf = next;
+#endif
       } else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) {
         set_gran(affinity_gran_package, -1);
         buf = next;
@@ -2724,6 +2729,14 @@ static void __kmp_stg_parse_places(char
     __kmp_affinity_gran = affinity_gran_core;
     __kmp_affinity_dups = FALSE;
     kind = "\"cores\"";
+#if KMP_USE_HWLOC
+  } else if (__kmp_match_str("tiles", scan, &next)) {
+    scan = next;
+    __kmp_affinity_type = affinity_compact;
+    __kmp_affinity_gran = affinity_gran_tile;
+    __kmp_affinity_dups = FALSE;
+    kind = "\"tiles\"";
+#endif
   } else if (__kmp_match_str("sockets", scan, &next)) {
     scan = next;
     __kmp_affinity_type = affinity_compact;
@@ -2821,6 +2834,14 @@ static void __kmp_stg_print_places(kmp_s
       } else {
         __kmp_str_buf_print(buffer, "='cores'\n");
       }
+#if KMP_USE_HWLOC
+    } else if (__kmp_affinity_gran == affinity_gran_tile) {
+      if (num > 0) {
+        __kmp_str_buf_print(buffer, "='tiles(%d)' \n", num);
+      } else {
+        __kmp_str_buf_print(buffer, "='tiles'\n");
+      }
+#endif
     } else if (__kmp_affinity_gran == affinity_gran_package) {
       if (num > 0) {
         __kmp_str_buf_print(buffer, "='sockets(%d)'\n", num);
@@ -2874,6 +2895,11 @@ static void __kmp_stg_parse_topology_met
   if (__kmp_str_match("all", 1, value)) {
     __kmp_affinity_top_method = affinity_top_method_all;
   }
+#if KMP_USE_HWLOC
+  else if (__kmp_str_match("hwloc", 1, value)) {
+    __kmp_affinity_top_method = affinity_top_method_hwloc;
+  }
+#endif
 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
   else if (__kmp_str_match("x2apic id", 9, value) ||
            __kmp_str_match("x2apic_id", 9, value) ||
@@ -2934,13 +2960,7 @@ static void __kmp_stg_parse_topology_met
 #endif /* KMP_GROUP_AFFINITY */
   else if (__kmp_str_match("flat", 1, value)) {
     __kmp_affinity_top_method = affinity_top_method_flat;
-  }
-#if KMP_USE_HWLOC
-  else if (__kmp_str_match("hwloc", 1, value)) {
-    __kmp_affinity_top_method = affinity_top_method_hwloc;
-  }
-#endif
-  else {
+  } else {
     KMP_WARNING(StgInvalidValue, name, value);
   }
 } // __kmp_stg_parse_topology_method
@@ -5223,6 +5243,9 @@ void __kmp_env_initialize(char const *st
             case affinity_gran_node:
               str = "node";
               break;
+            case affinity_gran_tile:
+              str = "tile";
+              break;
             default:
               KMP_DEBUG_ASSERT(0);
             }




More information about the Openmp-commits mailing list