[Openmp-commits] [openmp] r272565 - Hwloc refactoring patch

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Mon Jun 13 10:30:08 PDT 2016


Author: jlpeyton
Date: Mon Jun 13 12:30:08 2016
New Revision: 272565

URL: http://llvm.org/viewvc/llvm-project?rev=272565&view=rev
Log:
Hwloc refactoring patch

These changes remove the use of the hwloc_topology_ignore_type function, which
doesn't exist in the hwloc 2.0 API. The existing code strips the cache levels
out of the topology extracted from hwloc and then assumes the stripped topology
follows the typical three-level layout: packages -> cores -> HW threads.
However, that code performs unclean manipulations to determine at what level
those resources are located, and it also assumes too much about what hwloc
detects (for instance, there could be intermediate levels between socket and
core).
This new way of extracting the topology doesn't strip out any hardware objects
that hwloc detects. It does not assume the three-level topology; instead, it
searches the topology for the relevant three levels, using hwloc interface
functions for each bit of information. That is, the three-level topology
subset that our affinity code is interested in is extracted directly from the
hwloc topology tree.

For example, the new __kmp_hwloc_get_nobjs_under_obj function reliably gives
the user the number of cores under a socket without worrying about whether
there are unexpected objects between the socket object and core object in the
hwloc topology structure. Also, now that all topology information is kept, the
caches/NUMA nodes could be used to determine more sophisticated affinity
settings in the future.

There is also some cleanup code added for the destruction of the
__kmp_hwloc_topology object.

Differential Revision: http://reviews.llvm.org/D21195

Modified:
    openmp/trunk/runtime/src/kmp_affinity.cpp
    openmp/trunk/runtime/src/kmp_settings.c

Modified: openmp/trunk/runtime/src/kmp_affinity.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_affinity.cpp?rev=272565&r1=272564&r2=272565&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_affinity.cpp (original)
+++ openmp/trunk/runtime/src/kmp_affinity.cpp Mon Jun 13 12:30:08 2016
@@ -309,6 +309,72 @@ __kmp_affinity_print_topology(AddrUnsPai
 }
 
 #if KMP_USE_HWLOC
+
+// This function removes the topology levels that are radix 1 and don't offer
+// further information about the topology.  The most common example is when you
+// have one thread context per core, we don't want the extra thread context
+// level if it offers no unique labels.  So they are removed.
+// return value: the new depth of address2os
+static int
+__kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
+    int level;
+    int i;
+    int radix1_detected;
+
+    for (level = depth-1; level >= 0; --level) {
+        // Always keep the package level
+        if (level == *pkgLevel)
+            continue;
+        // Detect if this level is radix 1
+        radix1_detected = 1;
+        for (i = 1; i < nActiveThreads; ++i) {
+            if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
+                // There are differing label values for this level so it stays
+                radix1_detected = 0;
+                break;
+            }
+        }
+        if (!radix1_detected)
+            continue;
+        // Radix 1 was detected
+        if (level == *threadLevel) {
+            // If only one thread per core, then just decrement
+            // the depth which removes the threadlevel from address2os
+            for (i = 0; i < nActiveThreads; ++i) {
+                address2os[i].first.depth--;
+            }
+            *threadLevel = -1;
+        } else if (level == *coreLevel) {
+            // For core level, we move the thread labels over if they are still
+            // valid (*threadLevel != -1), and also reduce the depth another level
+            for (i = 0; i < nActiveThreads; ++i) {
+                if (*threadLevel != -1) {
+                    address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
+                }
+                address2os[i].first.depth--;
+            }
+            *coreLevel = -1;
+        }
+    }
+    return address2os[0].first.depth;
+}
+
+// Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
+// e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then
+//  this will return the number of PU's under the SOCKET object.
+static int
+__kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
+    int retval = 0;
+    hwloc_obj_t first;
+    for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
+        first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
+        first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
+    {
+        ++retval;
+    }
+    return retval;
+}
+
 static int
 __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
   kmp_i18n_id_t *const msg_id)
@@ -323,38 +389,13 @@ __kmp_affinity_create_hwloc_map(AddrUnsP
     KMP_CPU_ALLOC(oldMask);
     __kmp_get_system_affinity(oldMask, TRUE);
 
-    unsigned depth = hwloc_topology_get_depth(__kmp_hwloc_topology);
-    int threadLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_PU);
-    int coreLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_CORE);
-    int pkgLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET);
-    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 0;
-
-    //
-    // This makes an assumption about the topology being four levels:
-    // machines -> packages -> cores -> hardware threads
-    //
-    hwloc_obj_t current_level_iterator = hwloc_get_root_obj(__kmp_hwloc_topology);
-    hwloc_obj_t child_iterator;
-    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
-        child_iterator != NULL;
-        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
-    {
-        nPackages++;
-    }
-    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, pkgLevel, 0);
-    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
-        child_iterator != NULL;
-        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
-    {
-        nCoresPerPkg++;
-    }
-    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, coreLevel, 0);
-    for(child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
-        child_iterator != NULL;
-        child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
-    {
-        __kmp_nThreadsPerCore++;
-    }
+    int depth = 3;
+    int pkgLevel = 0;
+    int coreLevel = 1;
+    int threadLevel = 2;
+    nPackages = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_root_obj(__kmp_hwloc_topology), HWLOC_OBJ_SOCKET);
+    nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
+    __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
 
     if (! KMP_AFFINITY_CAPABLE())
     {
@@ -385,19 +426,40 @@ __kmp_affinity_create_hwloc_map(AddrUnsP
     //
     AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
 
-    unsigned num_hardware_threads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
-    unsigned i;
-    hwloc_obj_t hardware_thread_iterator;
+    hwloc_obj_t pu;
+    hwloc_obj_t core;
+    hwloc_obj_t socket;
     int nActiveThreads = 0;
-    for(i=0;i<num_hardware_threads;i++) {
-        hardware_thread_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, threadLevel, i);
-        Address addr(3);
-        if(! KMP_CPU_ISSET(i, fullMask)) continue;
-        addr.labels[0] = hardware_thread_iterator->parent->parent->logical_index;
-        addr.labels[1] = hardware_thread_iterator->parent->logical_index % nCoresPerPkg;
-        addr.labels[2] = hardware_thread_iterator->logical_index % __kmp_nThreadsPerCore;
-        retval[nActiveThreads] = AddrUnsPair(addr, hardware_thread_iterator->os_index);
-        nActiveThreads++;
+    int socket_identifier = 0;
+    for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
+        socket != NULL;
+        socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
+        socket_identifier++)
+    {
+        int core_identifier = 0;
+        for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
+            core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
+            core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
+            core_identifier++)
+        {
+            int pu_identifier = 0;
+            for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
+                pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
+                pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
+                pu_identifier++)
+            {
+                Address addr(3);
+                if(! KMP_CPU_ISSET(pu->os_index, fullMask))
+                    continue;
+                KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
+                    socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index));
+                addr.labels[0] = socket_identifier; // package
+                addr.labels[1] = core_identifier; // core
+                addr.labels[2] = pu_identifier; // pu
+                retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
+                nActiveThreads++;
+            }
+        }
     }
 
     //
@@ -433,7 +495,7 @@ __kmp_affinity_create_hwloc_map(AddrUnsP
         // Form an Address object which only includes the package level.
         //
         Address addr(1);
-        addr.labels[0] = retval[0].first.labels[pkgLevel-1];
+        addr.labels[0] = retval[0].first.labels[pkgLevel];
         retval[0].first = addr;
 
         if (__kmp_affinity_gran_levels < 0) {
@@ -460,14 +522,14 @@ __kmp_affinity_create_hwloc_map(AddrUnsP
     // nCoresPerPkg, & nPackages.  Make sure all these vars are set
     // correctly, and return if affinity is not enabled.
     //
-    __kmp_ncores = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, coreLevel);
+    __kmp_ncores = hwloc_get_nbobjs_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE);
 
     //
     // Check to see if the machine topology is uniform
     //
-    unsigned npackages = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, pkgLevel);
+    unsigned npackages = hwloc_get_nbobjs_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET);
     unsigned ncores = __kmp_ncores;
-    unsigned nthreads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
+    unsigned nthreads = hwloc_get_nbobjs_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU);
     unsigned uniform = (npackages * nCoresPerPkg * __kmp_nThreadsPerCore == nthreads);
 
     //
@@ -512,58 +574,7 @@ __kmp_affinity_create_hwloc_map(AddrUnsP
     // Find any levels with radiix 1, and remove them from the map
     // (except for the package level).
     //
-    int new_depth = 0;
-    int level;
-    unsigned proc;
-    for (level = 1; level < (int)depth; level++) {
-        if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
-           continue;
-        }
-        new_depth++;
-    }
-
-    //
-    // If we are removing any levels, allocate a new vector to return,
-    // and copy the relevant information to it.
-    //
-    if (new_depth != depth-1) {
-        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
-          sizeof(AddrUnsPair) * nActiveThreads);
-        for (proc = 0; (int)proc < nActiveThreads; proc++) {
-            Address addr(new_depth);
-            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
-        }
-        int new_level = 0;
-        for (level = 1; level < (int)depth; level++) {
-            if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology,level) == 1) && (level != pkgLevel)) {
-               if (level == threadLevel) {
-                   threadLevel = -1;
-               }
-               else if ((threadLevel >= 0) && (level < threadLevel)) {
-                   threadLevel--;
-               }
-               if (level == coreLevel) {
-                   coreLevel = -1;
-               }
-               else if ((coreLevel >= 0) && (level < coreLevel)) {
-                   coreLevel--;
-               }
-               if (level < pkgLevel) {
-                   pkgLevel--;
-               }
-               continue;
-            }
-            for (proc = 0; (int)proc < nActiveThreads; proc++) {
-                new_retval[proc].first.labels[new_level]
-                  = retval[proc].first.labels[level];
-            }
-            new_level++;
-        }
-
-        __kmp_free(retval);
-        retval = new_retval;
-        depth = new_depth;
-    }
+    depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);
 
     if (__kmp_affinity_gran_levels < 0) {
         //
@@ -571,10 +582,10 @@ __kmp_affinity_create_hwloc_map(AddrUnsP
         // in the machine topology map.
         //
         __kmp_affinity_gran_levels = 0;
-        if ((threadLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
             __kmp_affinity_gran_levels++;
         }
-        if ((coreLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
             __kmp_affinity_gran_levels++;
         }
         if (__kmp_affinity_gran > affinity_gran_package) {
@@ -583,14 +594,13 @@ __kmp_affinity_create_hwloc_map(AddrUnsP
     }
 
     if (__kmp_affinity_verbose) {
-        __kmp_affinity_print_topology(retval, nActiveThreads, depth-1, pkgLevel-1,
-          coreLevel-1, threadLevel-1);
+        __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
+          coreLevel, threadLevel);
     }
 
     KMP_CPU_FREE(oldMask);
     *address2os = retval;
-    if(depth == 0) return 0;
-    else return depth-1;
+    return depth;
 }
 #endif // KMP_USE_HWLOC
 
@@ -4051,6 +4061,12 @@ __kmp_affinity_uninitialize(void)
         __kmp_free( procarr );
         procarr = NULL;
     }
+# if KMP_USE_HWLOC
+    if (__kmp_hwloc_topology != NULL) {
+        hwloc_topology_destroy(__kmp_hwloc_topology);
+        __kmp_hwloc_topology = NULL;
+    }
+# endif
 }
 
 

Modified: openmp/trunk/runtime/src/kmp_settings.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_settings.c?rev=272565&r1=272564&r2=272565&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_settings.c (original)
+++ openmp/trunk/runtime/src/kmp_settings.c Mon Jun 13 12:30:08 2016
@@ -5294,25 +5294,18 @@ __kmp_env_initialize( char const * strin
         //
         const char *var = "KMP_AFFINITY";
 # if KMP_USE_HWLOC
-        if(hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
-            __kmp_hwloc_error = TRUE;
-            if(__kmp_affinity_verbose)
-                KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
+        if(__kmp_hwloc_topology == NULL) {
+            if(hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
+                __kmp_hwloc_error = TRUE;
+                if(__kmp_affinity_verbose)
+                    KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
+            }
+            if(hwloc_topology_load(__kmp_hwloc_topology) < 0) {
+                __kmp_hwloc_error = TRUE;
+                if(__kmp_affinity_verbose)
+                    KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
+            }
         }
-#  if HWLOC_API_VERSION >= 0x00020000
-        // new hwloc API
-        hwloc_topology_set_type_filter(__kmp_hwloc_topology, HWLOC_OBJ_L1CACHE, HWLOC_TYPE_FILTER_KEEP_NONE);
-        hwloc_topology_set_type_filter(__kmp_hwloc_topology, HWLOC_OBJ_L2CACHE, HWLOC_TYPE_FILTER_KEEP_NONE);
-        hwloc_topology_set_type_filter(__kmp_hwloc_topology, HWLOC_OBJ_L3CACHE, HWLOC_TYPE_FILTER_KEEP_NONE);
-        hwloc_topology_set_type_filter(__kmp_hwloc_topology, HWLOC_OBJ_L4CACHE, HWLOC_TYPE_FILTER_KEEP_NONE);
-        hwloc_topology_set_type_filter(__kmp_hwloc_topology, HWLOC_OBJ_L5CACHE, HWLOC_TYPE_FILTER_KEEP_NONE);
-        hwloc_topology_set_type_filter(__kmp_hwloc_topology, HWLOC_OBJ_L1ICACHE, HWLOC_TYPE_FILTER_KEEP_NONE);
-        hwloc_topology_set_type_filter(__kmp_hwloc_topology, HWLOC_OBJ_L2ICACHE, HWLOC_TYPE_FILTER_KEEP_NONE);
-        hwloc_topology_set_type_filter(__kmp_hwloc_topology, HWLOC_OBJ_L3ICACHE, HWLOC_TYPE_FILTER_KEEP_NONE);
-#  else
-        // old hwloc API
-        hwloc_topology_ignore_type(__kmp_hwloc_topology, HWLOC_OBJ_CACHE);
-#  endif
 # endif
         if ( __kmp_affinity_type == affinity_disabled ) {
             KMP_AFFINITY_DISABLE();
@@ -5320,15 +5313,10 @@ __kmp_env_initialize( char const * strin
         else if ( ! KMP_AFFINITY_CAPABLE() ) {
 # if KMP_USE_HWLOC
             const hwloc_topology_support* topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
-            if(hwloc_topology_load(__kmp_hwloc_topology) < 0) {
-                __kmp_hwloc_error = TRUE;
-                if(__kmp_affinity_verbose)
-                    KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
-            }
             // Is the system capable of setting/getting this thread's affinity?
             // also, is topology discovery possible? (pu indicates ability to discover processing units)
             // and finally, were there no errors when calling any hwloc_* API functions?
-            if(topology_support->cpubind->set_thisthread_cpubind &&
+            if(topology_support && topology_support->cpubind->set_thisthread_cpubind &&
                topology_support->cpubind->get_thisthread_cpubind &&
                topology_support->discovery->pu &&
                !__kmp_hwloc_error)




More information about the Openmp-commits mailing list